03_calculate_PFs
[1]:
# PIGEON-FEATHER modules (wildcard imports follow the tutorial convention;
# they provide export_iso_files, revert_hdxmsdata_to_dataframe, tools, etc.)
from pigeon_feather.data import *
from pigeon_feather.plot import *
from pigeon_feather.hxio import *
from pigeon_feather.spectra import *

# Standard library (the duplicate `import datetime` was removed)
import datetime
import os
import pickle

# Third-party
import numpy as np
import pandas as pd
Determination of memory status is not supported on this
platform, measuring for memoryleaks will never fail
[2]:
# Load the pickle file we saved in the previous notebook.
# NOTE: the date is pinned to the tutorial dataset. For fresh data use
#   today = datetime.date.today().strftime("%Y%m%d")
# (the original dynamic assignment was dead code — immediately overwritten).
today = "20240722"

with open(f"./data/hdxms_data_raw_{today}.pkl", "rb") as f:
    # pickle.load is safe here: the file was produced by our own earlier notebook.
    hdxms_data_list = pickle.load(f)

# Back-exchange correction for peptides with experimental full-deuteration
# data, based on the closest match in the database.
tools.backexchange_correction(hdxms_data_list)
Number of peptides with experimental max_d: 358
Number of peptides with no experimental max_d: 12
[3]:
# Make folders for results. exist_ok=True makes this idempotent and avoids
# the check-then-create race of os.path.exists() followed by os.makedirs().
today_date = datetime.date.today().strftime("%Y%m%d")
# today_date = '20240722'
results_path = f"ecDHFR_results_{today_date}"
os.makedirs(results_path, exist_ok=True)

# Folder holding the inputs for Bayesian sampling (pinned tutorial date).
out_path = "./data/PF_input_20240722"
peptide subtraction
[4]:
# [state.add_all_subtract() for data in hdxms_data_list for state in data.states]
# Generate additional peptides by subtracting overlapping peptide pairs.
# A plain nested loop replaces the previous `for i in range(1)` wrapper and
# the side-effect-only list comprehension (both anti-idioms).
for data in hdxms_data_list:
    for state in data.states:
        state.add_new_peptides_by_subtract()
117 new peptides added to the APO state.
127 new peptides added to the TRI state.
[9]:
# Number of peptides added by subtraction, one entry per state (here: APO, TRI).
[state.num_subtracted_added for data in hdxms_data_list for state in data.states]
[9]:
[117, 127]
Save the data as a pickle file for later use, and write the files used for Bayesian sampling.
[10]:
# Persist the (subtraction-augmented) dataset so later notebooks can reload it.
pickle_file = f"{out_path}/hdxms_data_list.pkl"
with open(pickle_file, "wb") as f:
    pickle.dump(hdxms_data_list, f)

# To reload later:
# with open(f"{out_path}/hdxms_data_list.pkl", "rb") as f:
#     hdxms_data_list = pickle.load(f)
Inputs for MCMC sampling
[11]:
exp_names = [
    "dhfr_tutorial_dataset",
]

# One export per experiment: isotope spectra files plus tables in
# Bayesian-HDX input format. enumerate() replaces the range(len(...))
# index loop; behavior is identical (including an IndexError if
# exp_names is shorter than hdxms_data_list).
for i, hdxms_data in enumerate(hdxms_data_list):
    # exp_name = raw_spectra_paths[i].split('/')[-2].split('SpecExport_')[-1]
    exp_name = exp_names[i]
    export_iso_files(
        hdxms_data, outdir=f"{out_path}/spectra_{exp_name}", overwrite=True
    )
    df = revert_hdxmsdata_to_dataframe(hdxms_data)
    convert_dataframe_to_bayesianhdx_format(
        df, protein_name=exp_name, OUTPATH=f"{out_path}"
    )
    print(exp_name)
Isotope files saved to ./data/PF_input_20240722/spectra_dhfr_tutorial_dataset
Reminder: sequence contains fastamides !!!
Reminder: sequence contains fastamides !!!
Data saved to ./data/PF_input_20240722
dhfr_tutorial_dataset
Write a ready-to-run script for each state
[12]:
# His-tagged ecDHFR construct sequence used for the sampling scripts.
protein_sequence = "MTGHHHHHHENLYFQSISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVTWVKSVDEAIAACGDVPEIMVIGGGRVYEQFLPKAQKLYLTHIDAEVEGDTHFPDYEPDDWESVFSEFHDADAQNSHSYCFEILERR"

# Unique state names across all experiments. dict.fromkeys deduplicates while
# preserving first-seen order, making script generation deterministic
# (list(set(...)) gave an arbitrary order between runs).
state_names = list(
    dict.fromkeys(
        state.state_name for data in hdxms_data_list for state in data.states
    )
)

# Emit one self-contained sampling script per protein state.
for protein_state in state_names:
    script = tools.generate_bayesian_hdx_script(
        exp_names,
        protein_sequence,
        protein_state,
        base_directory=".",
        making_chunks=True,
        pH=7.0,            # experimental pH
        temperature=293.0,  # Kelvin
        saturation=0.9,     # deuterium saturation of the labeling buffer
        rerun_num=3,
        extreme_value_prior=False,
        structural_prior=False,
    )
    with open(f"{out_path}/run_bayesian_hdx_{protein_state}_chunks.py", "w") as f:
        f.write(script)
Two priors

Make sure you generate the priors if you enable the two priors in tools.generate_bayesian_hdx_script.
[13]:
# Optional priors for the Bayesian sampling. Generate them here ONLY if you
# enabled extreme_value_prior / structural_prior in
# tools.generate_bayesian_hdx_script above; otherwise leave this commented out.

# uptake prior
#tools.generate_extreme_value_prior(hdxms_data_list, out_path)

# structural prior (requires a solvated PDB structure for each state)
# solvated_pdbs = [
#     "./data/5DFR_APO_relaxed_best_solvated.pdb",
#     "./data/6XG5_TRI_relaxed_best_solvated.pdb",
#     "./data/1RG7_MTX_relaxed_best_solvated.pdb",
# ]
# for i, state_name in enumerate(["APO", "TRI", "MTX"]):
#     tools.generate_structural_prior(
#         protein_sequence, solvated_pdbs[i], out_path, state_name
#     )
You can run the script in the terminal with the following command:
cd ./data/PF_input_20240722
python run_bayesian_hdx_APO_chunks.py
The simulations usually take several hours, but the duration varies based on the size of the dataset (number of peptides, time points, and replicates).