05_plot_the_results

[1]:
from pigeon_feather.data import *
from pigeon_feather.plot import *
from pigeon_feather.hxio import *
from pigeon_feather.spectra import *


import numpy as np
import pandas as pd

import datetime
import os
import pickle
import datetime
Determination of memory status is not supported on this
 platform, measuring for memoryleaks will never fail
[2]:
# load the pickle file we saved in the previous notebook


today = datetime.date.today().strftime("%Y%m%d")
today = "20240722"

with open(f"./data/hdxms_data_raw_{today}.pkl", "rb") as f:
    hdxms_data_list = pickle.load(f)


# back exchange correction for peptides with experimental full deuteration data based its closest match in the database
tools.backexchange_correction(hdxms_data_list)
Number of peptides with experimental max_d: 358
Number of peptides with no experimental max_d: 12
[3]:
# make folders for results

today_date = datetime.date.today().strftime("%Y%m%d")
# today_date = '20240722'
results_path = f"ecDHFR_results_{today_date}"

if not os.path.exists(results_path):
    os.makedirs(results_path)


out_path = "./data/PF_input_20240722"

Create an Analysis object and load the results. Note: The temperature and pH are crucial for calculating the intrinsic exchange rates, so make sure to input the correct values. The chunk size and chunk number can be altered by the user. In this tutorial, they were automatically determined by a helper function.

[4]:
from pigeon_feather.analysis import Analysis, get_index_offset
from pigeon_feather.tools import optimal_chunks
import matplotlib.pyplot as plt

RUN_NUM = 3

apo_states = [
    state
    for data in hdxms_data_list
    for state in data.states
    if state.state_name == "APO"
]

tri_states = [
    state
    for data in hdxms_data_list
    for state in data.states
    if state.state_name == "TRI"
]

ana_apo_1 = Analysis(apo_states, temperature=293.0, pH=7.0)

chunk_num, chunk_size = optimal_chunks(len(ana_apo_1.protein_sequence))
ana_apo_1.load_bayesian_hdx_oupt_chunks(
    chunk_size=chunk_size,
    chunk_num=chunk_num,
    state_name="APO",
    run_num=RUN_NUM,
    N=200,
    bayesian_hdx_data_folder=f"{out_path}/bayesian_hdx_output_chunks",
)


ana_tri_1 = Analysis(tri_states, temperature=293.0, pH=7.0)

ana_tri_1.load_bayesian_hdx_oupt_chunks(
    chunk_size=chunk_size,
    chunk_num=chunk_num,
    state_name="TRI",
    run_num=RUN_NUM,
    N=200,
    bayesian_hdx_data_folder=f"{out_path}/bayesian_hdx_output_chunks",
)

barplot of the kex

barplot showing the FEATHER derived exchange rates. index_offset is determined by comparison to the sequence in HX/MS and the pdb file provided.

[5]:
# APO state only

import numpy as np


fig, axes = plt.subplots(2, 1, figsize=(40, 10), sharey=True, sharex=True)

ana_apo_1.plot_kex_bar(
    ax=axes[0], resolution_indicator_pos=15, label="APO", show_seq=False, color="k"
)

ana_tri_1.plot_kex_bar(
    ax=axes[1], resolution_indicator_pos=15, label="TRI", show_seq=False, color="red"
)

from matplotlib.pyplot import step
from matplotlib.ticker import FuncFormatter, MultipleLocator

from pigeon_feather.analysis import get_index_offset

pdb_file = "./data/6XG5_TRI.pdb"
index_offset = get_index_offset(ana_apo_1, pdb_file)

# # ax.xaxis.set_major_locator(plt.MultipleLocator(2))
axes[0].set_xticks(axes[0].get_xticks()[::2])
axes[0].set_xticklabels(axes[0].get_xticklabels(), fontdict={"fontsize": 24})


def format_func(value, tick_number):
    return f"{int(value - index_offset +1)}"


axes[0].xaxis.set_major_formatter(FuncFormatter(format_func))

seq_pos = 17
for ii in range(0, len(ana_apo_1.protein_sequence[:]), 2):
    axes[0].text(
        ii,
        seq_pos,
        ana_apo_1.protein_sequence[ii],
        ha="center",
        va="center",
        fontsize=22,
    )


for ax in axes:
    ax.set_ylabel("-log(k$_{ex}$)")
    #ax.legend(loc="lower left")
    ax.yaxis.set_major_locator(plt.MultipleLocator(5))

    ax.plot(
        [-3, len(ana_apo_1.protein_sequence) + 3],
        [5, 5],
        color="gray",
        lw=1,
        linestyle="--",
        alpha=0.5,
    )
    ax.plot(
        [-3, len(ana_apo_1.protein_sequence) + 3],
        [10, 10],
        color="gray",
        lw=1,
        linestyle="--",
        alpha=0.5,
    )


plt.subplots_adjust(hspace=0.08)
fig.savefig(f"{results_path}/ecDHFR_kex_bar_APO_TRI_{today_date}.pdf")
../_images/tutorials_05_plot_the_results_8_0.png

logP projection to a pdb structure

[6]:
from pigeon_feather.analysis import BFactorPlot
[7]:
bfactor_plot = BFactorPlot(
    ana_apo_1,
    pdb_file="./data/6XG5_TRI.pdb",
    plot_deltaG=True,
    temperature=293.15,
    logP_threshold=10,
)
bfactor_plot.plot(f"{results_path}/6XG5_APO_deltaG.pdb")

# bfactor_plot = BFactorPlot(
#     ana_apo_1,
#     pdb_file="./data/6XG5_APO_relaxed_best_solvated.pdb",
#     plot_deltaG=True,
#     temperature=293.15,
# )
# bfactor_plot.plot(f"{out_path}/6XG5_APO_deltaG.pdb")

delta log(PF)s

if there are mutiple states availble, you can map the difference to the pdb strutcure

[8]:
bfactor_plot = BFactorPlot(
    ana_apo_1,
    ana_tri_1,
    pdb_file="./data/6XG5_TRI.pdb",
    plot_deltaG=True,
    temperature=293.15,
    logP_threshold=10,
)
bfactor_plot.plot(f"{results_path}/6XG5_deltaG_APO-TRI.pdb")

You can visualize the results in PyMOL by running the following commands:

spectrum b, green_white_magenta, minimum=-20, maximum=20; select nans, not (b=0 or b>0 or b<0); color grey20, nans;

nans are Prolines and residues with no coverage.

export logPFs to a csv

[9]:
import MDAnalysis
from pigeon_feather.analysis import get_res_avg_logP, get_res_avg_logP_std

pdb_file = "./data/6XG5_TRI.pdb"
index_offset = get_index_offset(ana_apo_1, pdb_file)


def logPF_to_deltaG(ana_obj, logPF):
    """
    :param logPF: logP value
    :return: deltaG in kJ/mol, local unfolding energy
    """

    return 8.3145 * ana_obj.temperature * np.log(10) * logPF / 1000


def create_logP_df(ana_obj, index_offset):
    df_logPF = pd.DataFrame()

    for res_i, _ in enumerate(ana_obj.results_obj.protein_sequence):
        res_obj_i = ana_obj.results_obj.get_residue_by_resindex(res_i)

        avg_logP, std_logP = get_res_avg_logP(res_obj_i)
        #std_logP = get_res_avg_logP_std(res_obj_i)

        df_i = pd.DataFrame(
            {
                "resid": [res_obj_i.resid - index_offset],
                #"resname": [res_obj_i.resname],
                "resname": [MDAnalysis.lib.util.convert_aa_code(res_obj_i.resname)],
                'avg_dG': [logPF_to_deltaG(ana_obj, avg_logP)],
                'std_dG': [logPF_to_deltaG(ana_obj, std_logP)],
                "avg_logP": [avg_logP],
                "std_logP": [max(std_logP, 0.35)],
                "is_nan": [res_obj_i.is_nan()],
            }
        )

        if res_obj_i.is_nan():
            df_i["single resloved"] = [np.nan]
            df_i["min_pep logPs"] = [np.nan]

        else:
            df_i["single resloved"] = [res_obj_i.mini_pep.if_single_residue()]
            df_i["min_pep logPs"] = [res_obj_i.clustering_results_logP]

        df_logPF = pd.concat([df_logPF, df_i])

    df_logPF = df_logPF.reset_index(drop=True)

    return df_logPF


df = create_logP_df(ana_apo_1, index_offset)
df.to_csv(f"{results_path}/logP_APO_{today_date}.csv", index=False)

[10]:
# fecth two df and merge
df_apo = create_logP_df(ana_apo_1, index_offset)
df_tri = create_logP_df(ana_tri_1, index_offset)

df = pd.merge(df_apo, df_tri, on="resid", suffixes=("_APO", "_TRI"))

df.to_csv(f"{results_path}/logP_APO_TRI_{today_date}.csv", index=False)
[ ]: