{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 03_calculate_PFs" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Determination of memory status is not supported on this \n", " platform, measuring for memoryleaks will never fail\n" ] } ], "source": [ "from pigeon_feather.data import *\n", "from pigeon_feather.plot import *\n", "from pigeon_feather.hxio import *\n", "from pigeon_feather.spectra import *\n", "\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import datetime\n", "import os\n", "import pickle\n", "import datetime" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of peptides with experimental max_d: 358\n", "Number of peptides with no experimental max_d: 12\n" ] } ], "source": [ "# load the pickle file we saved in the previous notebook\n", "\n", "\n", "today = datetime.date.today().strftime(\"%Y%m%d\")\n", "today = \"20240722\"\n", "\n", "with open(f\"./data/hdxms_data_raw_{today}.pkl\", \"rb\") as f:\n", " hdxms_data_list = pickle.load(f)\n", "\n", "\n", "# back exchange correction for peptides with experimental full deuteration data based its closest match in the database\n", "tools.backexchange_correction(hdxms_data_list)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# make folders for results\n", "\n", "today_date = datetime.date.today().strftime(\"%Y%m%d\")\n", "# today_date = '20240722'\n", "results_path = f\"ecDHFR_results_{today_date}\"\n", "\n", "if not os.path.exists(results_path):\n", " os.makedirs(results_path)\n", "\n", "\n", "out_path = \"./data/PF_input_20240722\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## peptide subtraction" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "117 new peptides added to the APO state.\n", "127 new peptides added to the TRI state.\n" ] } ], "source": [ "# [state.add_all_subtract() for data in hdxms_data_list for state in data.states]\n", "# add_new_peptides_by_subtract()\n", "for i in range(1):\n", " [\n", " state.add_new_peptides_by_subtract()\n", " for data in hdxms_data_list[:]\n", " for state in data.states\n", " ]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[117, 127]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[state.num_subtracted_added for data in hdxms_data_list for state in data.states]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "save the data as a pickle file for later use, and write to files used for bayesian sampling" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "with open(f\"{out_path}/hdxms_data_list.pkl\", \"wb\") as f:\n", " pickle.dump(hdxms_data_list, f)\n", "\n", "# with open(f\"{out_path}/hdxms_data_list.pkl\", \"rb\") as f:\n", "# hdxms_data_list = pickle.load(f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## inputs for MCMC sampling" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Isotope files saved to ./data/PF_input_20240722/spectra_dhfr_tutorial_dataset\n", "Reminder: sequence contains fastamides !!!\n", "Reminder: sequence contains fastamides !!!\n", "Data saved to ./data/PF_input_20240722\n", "dhfr_tutorial_dataset\n" ] } ], "source": [ "exp_names = [\n", " \"dhfr_tutorial_dataset\",\n", "]\n", "\n", "for i in range(len(hdxms_data_list)):\n", " # exp_name = raw_spectra_paths[i].split('/')[-2].split('SpecExport_')[-1]\n", " exp_name = exp_names[i]\n", " export_iso_files(\n", " hdxms_data_list[i], outdir=f\"{out_path}/spectra_{exp_name}\", overwrite=True\n", " )\n", " df = revert_hdxmsdata_to_dataframe(hdxms_data_list[i])\n", " convert_dataframe_to_bayesianhdx_format(\n", " df, protein_name=exp_name, OUTPATH=f\"{out_path}\"\n", " )\n", "\n", " print(exp_name)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "write ready to run script for each state" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "protein_sequence = \"MTGHHHHHHENLYFQSISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVTWVKSVDEAIAACGDVPEIMVIGGGRVYEQFLPKAQKLYLTHIDAEVEGDTHFPDYEPDDWESVFSEFHDADAQNSHSYCFEILERR\"\n", "\n", "state_names = list(\n", " set([state.state_name for data in hdxms_data_list for state in data.states])\n", ")\n", "for protein_state in state_names:\n", " script = tools.generate_bayesian_hdx_script(\n", " exp_names,\n", " protein_sequence,\n", " protein_state,\n", " base_directory=\".\",\n", " making_chunks=True,\n", " pH=7.0,\n", " temperature=293.0,\n", " saturation=0.9,\n", " rerun_num=3,\n", " extreme_value_prior=False,\n", " structural_prior=False,\n", " )\n", "\n", " with open(f\"{out_path}/run_bayesian_hdx_{protein_state}_chunks.py\", \"w\") as f:\n", " f.write(script)\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## two priors" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make sure you generate the priors if you enable twp priors in `tools.generate_bayesian_hdx_script`." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# uptake prior\n", "#tools.generate_extreme_value_prior(hdxms_data_list, out_path)\n", "\n", "\n", "# structural prior\n", "\n", "# solvated_pdbs = [\n", "# \"./data/5DFR_APO_relaxed_best_solvated.pdb\",\n", "# \"./data/6XG5_TRI_relaxed_best_solvated.pdb\",\n", "# \"./data/1RG7_MTX_relaxed_best_solvated.pdb\",\n", "# ]\n", "\n", "# for i, state_name in enumerate([\"APO\", \"TRI\", \"MTX\"]):\n", "\n", "\n", "# tools.generate_structural_prior(\n", "# protein_sequence, solvated_pdbs[i], out_path, state_name\n", "# )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can run the script in the terminal with the following command:\n", "\n", "```sh\n", "cd ./data/bayesian_hdx_input_20240722\n", "python run_bayesian_hdx_APO_chunks.py\n", "```\n", "\n", "The simulations usually take several hours, but the duration varies based on the size of the dataset (number of peptides, time points, and replicates).\n", "\n" ] } ], "metadata": { "kernelspec": { "display_name": "pigeon_feather", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 2 }