ecDHFR: a FEATHER walkthrough

The tutorial dataset can be downloaded from this link.

import

[1]:
from pigeon_feather.data import *
from pigeon_feather.plot import *
from pigeon_feather.hxio import *
from pigeon_feather.spectra import *


import numpy as np
import pandas as pd
[3]:
# make folders for results

import datetime
import os

out_path = "./data/bayesian_hdx_input_20240722"

today_date = datetime.date.today().strftime("%Y%m%d")
# today_date = '20240722'
results_path = os.path.join(out_path, f"ecDHFR_results_{today_date}")

if not os.path.exists(results_path):
    os.makedirs(results_path)

load the data

[2]:
tables = ['./data/ecDHFR_tutorial.csv']

ranges = ['./data/rangeslist.csv']


raw_spectra_paths = [
    f"./data/SpecExport/",
]

protein_sequence = "MTGHHHHHHENLYFQSISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVTWVKSVDEAIAACGDVPEIMVIGGGRVYEQFLPKAQKLYLTHIDAEVEGDTHFPDYEPDDWESVFSEFHDADAQNSHSYCFEILERR"

# load the data
hdxms_data_list = []
for i in range(len(tables)):
    # for i in [4]:
    print(tables[i])


    # read the data and clean it
    cleaned = read_hdx_tables([tables[i]], [ranges[i]], exclude=False, states_subset=['APO','TRI'])

    # convert the cleaned data to hdxms data object
    hdxms_data = load_dataframe_to_hdxmsdata(
        cleaned,
        n_fastamides=2,
        protein_sequence=protein_sequence,
        fulld_approx=False,
        saturation=0.9,
    )

    # load the raw ms data to the hdxms data object
    load_raw_ms_to_hdxms_data(
        hdxms_data,
        raw_spectra_paths[i],
    )

    hdxms_data_list.append(hdxms_data)


./data/ecDHFR_tutorial.csv
rangeslist included !
Removed 0 peptides from state APO due to missing raw MS data.
Removed 70 peptides from state APO due to high back exchange.
Removed 2 peptides from state TRI due to missing raw MS data.
Removed 70 peptides from state TRI due to high back exchange.
Done loading raw MS data.
[3]:
hdxms_data_list
[3]:
[<pigeon_feather.data.HDXMSData at 0x7f6ccdb68fd0>]
[4]:
# save the raw data as a pickle file
import pickle

today = datetime.date.today().strftime("%Y%m%d")
today = "20240722"  # fixed date matching the provided tutorial dataset

# with open(f"./data/hdxms_data_raw_{today}.pkl", "wb") as f:
#     pickle.dump(hdxms_data_list, f)

with open(f"./data/hdxms_data_raw_{today}.pkl", "rb") as f:
    hdxms_data_list = pickle.load(f)
[5]:
from pigeon_feather.hxio import get_all_statics_info

get_all_statics_info(hdxms_data_list)
============================================================
                    HDX-MS Data Statistics
============================================================
States names: ['APO', 'TRI']
Time course (s): [46.0, 373.5, 572.5, 2011.0, 7772.0, 30811.5, 43292.0]
Number of time points: 7
Protein sequence length: 174
Average coverage: 0.93
Number of unique peptides: 186
Average peptide length: 9.4
Redundancy (based on average coverage): 10.0
Average peptide length to redundancy ratio: 0.9
Backexchange average, IQR: 0.19, 0.15
============================================================

back exchange correction

Back exchange correction uses the experimental full-deuteration data for each peptide; peptides without such data are corrected based on their closest match in the dataset.

[6]:
tools.backexchange_correction(hdxms_data_list)
Number of peptides with experimental max_d: 358
Number of peptides with no experimental max_d: 12
[7]:
# grab all the peptides

all_peps = [
    pep
    for data in hdxms_data_list[:]
    for state in data.states
    for pep in state.peptides
]

len(set([pep.identifier for pep in all_peps]))
[7]:
186
[8]:
max_d = [pep.max_d / pep.theo_max_d for pep in all_peps]
plt.hist(max_d, bins=20)
[8]:
(array([18., 16., 10., 20., 30., 20., 42., 20., 24., 40., 38., 28., 20.,
        14.,  8.,  6.,  4.,  2.,  2.,  8.]),
 array([0.60127572, 0.62628601, 0.6512963 , 0.67630658, 0.70131687,
        0.72632716, 0.75133745, 0.77634774, 0.80135802, 0.82636831,
        0.8513786 , 0.87638889, 0.90139918, 0.92640947, 0.95141975,
        0.97643004, 1.00144033, 1.02645062, 1.05146091, 1.07647119,
        1.10148148]),
 <BarContainer object of 20 artists>)
../_images/tutorials_00_ecdhfr_showcase_14_1.png

plot uptake plots

[11]:
# option 1
import math
import re

from matplotlib.ticker import MultipleLocator, FormatStrFormatter
import matplotlib.ticker as ticker
import matplotlib.lines as mlines
from pigeon_feather.analysis import get_index_offset
from matplotlib.ticker import LogLocator


font = {"family": "Arial", "weight": "normal", "size": 36}
axes = {"titlesize": 36, "titleweight": "bold", "labelsize": 36, "linewidth": 5}
plt.rc("font", **font)
plt.rc("axes", **axes)
plt.rc("lines", lw=5)

colors = ["k", "red", "blue", "purple", "gray", "orange", "yellow", "green", "brown"]


all_peps = [
    peptide
    for hdxms_data in hdxms_data_list
    for state in hdxms_data.states
    for peptide in state.peptides
    if peptide.note is None and state.state_name != "RAT"
]
# optionally keep only peptides with more than 5 unique time points:
# all_peps = [pep for pep in all_peps if pep.unique_num_timepoints > 5]
all_idfs = list(set([peptide.identifier for peptide in all_peps]))
all_idfs.sort(key=lambda x: int(re.search(r"(-?\d+)", x).group()))



def idf_to_pep(idf):
    return [pep for pep in all_peps if pep.identifier == idf][0]

# num_subplots_per_figure = 100
num_subplots_per_figure = 250
num_figures = math.ceil(len(all_idfs) / num_subplots_per_figure)


all_idfs_subset = all_idfs[:]

for fig_index in range(num_figures):
    # Select the subset of errors for the current figure
    selected_idf = all_idfs_subset[
        fig_index * num_subplots_per_figure : (fig_index + 1) * num_subplots_per_figure
    ]
    num_col = math.ceil(len(selected_idf) / 5)

    fig, axs = plt.subplots(
        num_col, 5, figsize=(9 * 5, 8 * num_col)
    )  # Adjust subplot size as needed

    for i, idf in enumerate(selected_idf):
        ax = axs[i // 5, i % 5]


        pep = idf_to_pep(idf)
        ax.axhline(y=pep.max_d, color='lightgray', linestyle='--', linewidth=5)


        uptake = UptakePlot(
            hdxms_data_list,
            idf,
            states_subset=["APO", "TRI"],
            if_plot_fit=False,
            figure=fig,
            ax=ax,
        )

        ax.set_xlim(1e1, 1e5)
        ax.xaxis.set_major_locator(LogLocator(base=10.0, numticks=5))


        y_max = pep.theo_max_d/hdxms_data_list[0].saturation
        ax.set_ylim(-0.5, y_max + 0.5)


        #Custom legend
        handles, labels = ax.get_legend_handles_labels()
        new_labels = [label for label in labels if label in ["APO", "TRI"]]
        new_handles = [
            handle
            for handle, label in zip(handles, labels)
            if label in ["APO", "TRI"]
        ]
        ax.legend(new_handles, new_labels, title="state", title_fontsize="small",loc='best')


    # Layout adjustment and save
    fig.tight_layout()
    fig.savefig(f"{results_path}/DHFR_exp_uptake_{fig_index}.pdf")

peptide subtraction

Overlapping peptides are subtracted from one another to generate shorter peptide fragments, which improves the spatial resolution of the analysis.

[9]:
# [state.add_all_subtract() for data in hdxms_data_list for state in data.states]
# add_new_peptides_by_subtract()
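# one round of pairwise subtraction (increase the range for additional rounds)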
for i in range(1):
    [
        state.add_new_peptides_by_subtract()
        for data in hdxms_data_list[:]
        for state in data.states
    ]
117 new peptides added to the APO state.
126 new peptides added to the TRI state.
[10]:
[state.num_subtracted_added for data in hdxms_data_list for state in data.states]
[10]:
[117, 126]

save the data as a pickle file for later use, and write to files used for bayesian sampling

[11]:
import os

out_path = f"./data/bayesian_hdx_input_{today}"
if not os.path.exists(out_path):
    os.makedirs(out_path)


import pickle

with open(f"{out_path}/hdxms_data_list.pkl", "wb") as f:
    pickle.dump(hdxms_data_list, f)

# with open(f"{out_path}/hdxms_data_list.pkl", "rb") as f:
#     hdxms_data_list = pickle.load(f)

inputs for MCMC sampling

[12]:
exp_names = [
    "dhfr_tutorial_dataset",
]

for i in range(len(hdxms_data_list)):
    # exp_name = raw_spectra_paths[i].split('/')[-2].split('SpecExport_')[-1]
    exp_name = exp_names[i]
    export_iso_files(
        hdxms_data_list[i], outdir=f"{out_path}/spectra_{exp_name}", overwrite=True
    )
    df = revert_hdxmsdata_to_dataframe(hdxms_data_list[i])
    convert_dataframe_to_bayesianhdx_format(
        df, protein_name=exp_name, OUTPATH=f"{out_path}"
    )

    print(exp_name)
Isotope files saved to ./data/bayesian_hdx_input_20240722/spectra_dhfr_tutorial_dataset
Reminder: sequence contains fastamides !!!
Reminder: sequence contains fastamides !!!
Data saved to ./data/bayesian_hdx_input_20240722
dhfr_tutorial_dataset
[15]:
# write ready to run script for each state

protein_sequence = "MTGHHHHHHENLYFQSISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVTWVKSVDEAIAACGDVPEIMVIGGGRVYEQFLPKAQKLYLTHIDAEVEGDTHFPDYEPDDWESVFSEFHDADAQNSHSYCFEILERR"

state_names = list(
    set([state.state_name for data in hdxms_data_list for state in data.states])
)
for protein_state in state_names:
    script = tools.generate_bayesian_hdx_script(
        exp_names,
        protein_sequence,
        protein_state,
        base_directory=".",
        making_chunks=True,
        pH=7.0,
        temperature=293.0,
        saturation=0.9,
        rerun_num=3,
        extreme_value_prior=False,
        structural_prior=False,
    )

    with open(f"{out_path}/run_bayesian_hdx_{protein_state}_chunks.py", "w") as f:
        f.write(script)



two priors

Optionally, an uptake (extreme-value) prior and a structural prior can be generated here; they are enabled via the extreme_value_prior and structural_prior flags when generating the run scripts above.

[14]:
# uptake prior
#tools.generate_extreme_value_prior(hdxms_data_list, out_path)


# structural prior

# solvated_pdbs = [
#     "./data/5DFR_APO_relaxed_best_solvated.pdb",
#     "./data/6XG5_TRI_relaxed_best_solvated.pdb",
#     "./data/1RG7_MTX_relaxed_best_solvated.pdb",
# ]

# for i, state_name in enumerate(["APO", "TRI", "MTX"]):


#     tools.generate_structural_prior(
#         protein_sequence, solvated_pdbs[i], out_path, state_name
#     )

run the sampling

You can run the script in the terminal with the following command:

cd ./data/bayesian_hdx_input_20240722
python run_bayesian_hdx_APO_chunks.py

The simulations usually take several hours, but the duration varies based on the size of the dataset (number of peptides, time points, and replicates).
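
Alternatively, a minimal sketch for launching both state scripts from within the notebook (sequential runs, assuming the two scripts generated above):

import subprocess

# run the generated sampling script for each state (several hours each)
for state_name in ["APO", "TRI"]:
    subprocess.run(
        ["python", f"run_bayesian_hdx_{state_name}_chunks.py"],
        cwd=out_path,
        check=True,
    )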

Analysis

[13]:
import pickle

# resolve pigeon_feather.data classes explicitly when unpickling
class CustomUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if name == "ProteinState":
            from pigeon_feather.data import ProteinState

            return ProteinState
        if name == "HDXMSData":
            from pigeon_feather.data import HDXMSData

            return HDXMSData
        if name == "Peptide":
            from pigeon_feather.data import Peptide

            return Peptide
        if name == "Timepoint":
            from pigeon_feather.data import Timepoint

            return Timepoint
        return super().find_class(module, name)


# read pickle
with open(f"{out_path}/hdxms_data_list.pkl", "rb") as f:
    hdxms_data_list = CustomUnpickler(f).load()
[15]:
hdxms_data_list
[15]:
[<pigeon_feather.data.HDXMSData at 0x7f095a02d150>]
[16]:
for data in hdxms_data_list:
    for state in data.states:
        state.peptides = [pep for pep in state.peptides if pep.note is None]
        for pep in state.peptides:
            pep.timepoints = [tp for tp in pep.timepoints if tp.note is None]

Create an Analysis object and load the results. Note: the temperature and pH are crucial for calculating the intrinsic exchange rates, so make sure to input the correct values. The chunk size and chunk number can be altered by the user; in this tutorial they were automatically determined by a helper function.

[17]:
from pigeon_feather.analysis import Analysis, get_index_offset
from pigeon_feather.tools import optimal_chunks
import matplotlib.pyplot as plt

RUN_NUM = 3

apo_states = [
    state
    for data in hdxms_data_list
    for state in data.states
    if state.state_name == "APO"
]

tri_states = [
    state
    for data in hdxms_data_list
    for state in data.states
    if state.state_name == "TRI"
]

ana_apo_1 = Analysis(apo_states, temperature=293.0, pH=7.0)

chunk_num, chunk_size = optimal_chunks(len(ana_apo_1.protein_sequence))
ana_apo_1.load_bayesian_hdx_oupt_chunks(
    chunk_size=chunk_size,
    chunk_num=chunk_num,
    state_name="APO",
    run_num=RUN_NUM,
    N=200,
    bayesian_hdx_data_folder=f"{out_path}/bayesian_hdx_output_chunks",
)


ana_tri_1 = Analysis(tri_states, temperature=293.0, pH=7.0)

ana_tri_1.load_bayesian_hdx_oupt_chunks(
    chunk_size=chunk_size,
    chunk_num=chunk_num,
    state_name="TRI",
    run_num=RUN_NUM,
    N=200,
    bayesian_hdx_data_folder=f"{out_path}/bayesian_hdx_output_chunks",
)
[18]:
# for mini in ana_apo_1.results_obj.mini_peps:
#     print(mini.std_within_clusters_log_kex)

Fitting check

Two fitting check functions are available: 1. centroid-level fitting check; 2. isotopic mass envelope fitting check.

[19]:
from pigeon_feather.analysis import (
    check_fitted_isotope_envelope,
    check_fitted_peptide_uptake,
)

envelope check

[20]:
check_state_name = "APO"
check_ana_obj = ana_apo_1


all_peps = [
    pep
    for data in hdxms_data_list
    for state in data.states
    for pep in state.peptides
    if state.state_name == check_state_name and pep.note is None
]
all_tps = [
    tp
    for pep in all_peps
    for tp in pep.timepoints
    if pep.get_timepoint(0) is not None and tp.deut_time != np.inf and tp.deut_time != 0
]

[21]:
envelope_errors = [(check_fitted_isotope_envelope(ana_apo_1, tp), tp) for tp in all_tps]
envelope_errors = sorted(envelope_errors, key=lambda x: x[0], reverse=False)

# you can plot the fitted isotope envelope and the experimental data
check_fitted_isotope_envelope(check_ana_obj, envelope_errors[300][1], if_plot=True)
[21]:
0.18086398727111275
../_images/tutorials_00_ecdhfr_showcase_41_1.png
[22]:
plt.hist(
    np.array(envelope_errors)[:, 0],
)
plt.xlabel("Sum AE")
plt.ylabel("Count")
[22]:
Text(0, 0.5, 'Count')
../_images/tutorials_00_ecdhfr_showcase_42_1.png
[23]:

print(np.nanmedian(np.array(envelope_errors)[:, 0]))
print(np.nanmean(np.array(envelope_errors)[:, 0]))
0.3646019382844139
0.39348193324578395
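
It can also help to inspect the worst-fitting spectra directly; a minimal sketch using the (error, timepoint) pairs built above:

# five timepoints with the largest envelope error
worst = sorted(
    (pair for pair in envelope_errors if not np.isnan(pair[0])),
    key=lambda x: x[0],
    reverse=True,
)[:5]

for err, tp in worst:
    print(f"deut_time = {tp.deut_time:.1f} s, sum AE = {err:.2f}")
    # check_fitted_isotope_envelope(check_ana_obj, tp, if_plot=True)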

uptake check

[24]:
from pigeon_feather.analysis import check_fitted_peptide_uptake


all_idfs = list(set([pep.identifier for pep in all_peps]))


def extract_numbers(s):
    numbers = re.findall(r"(-?\d+)-(-?\d+)", s)
    return tuple(map(int, numbers[0]))


all_idfs.sort(key=extract_numbers)
[25]:
uptake_errors = []

all_peps_grouped = tools.group_by_attributes(
    all_peps, ["protein_state.state_name", "identifier"]
)

for idf in all_idfs:
    try:
        idf_peps = all_peps_grouped[(check_state_name, idf)]
        avg_pep = tools.average_peptides(idf_peps)

        result = check_fitted_peptide_uptake(
            check_ana_obj, hdxms_data_list, avg_pep, state_name=check_state_name
        )
        uptake_errors.append((result, avg_pep))
    except Exception as e:
        print(idf, e)


uptake_errors_array = np.array([i[0] for i in uptake_errors])
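
As with the envelope errors, a quick look at summary statistics and the worst offenders (a sketch using the list just built):

print(np.nanmedian(uptake_errors_array))
print(np.nanmean(uptake_errors_array))

# five peptides with the largest uptake error
for err, pep in sorted(
    [t for t in uptake_errors if not np.isnan(t[0])], key=lambda x: x[0], reverse=True
)[:5]:
    print(pep.identifier, round(err, 3))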
[26]:
import seaborn as sns


fig, axes = plt.subplots(1, 2, figsize=(12, 5.5))

# # Plotting with complementary colors
# sns.histplot([x[0] for x in envelope_errors], bins=20, kde=True, ax=axes[0], color="#FF6347")
# sns.histplot(uptake_errors_array, bins=20, kde=True, ax=axes[1], color="#4682B4")


color_1 = "#53b1b1"  # A standard blue color
color_2 = "#f6c624"  # A teal color

# Plotting with chosen colors
sns.histplot([x[0] for x in envelope_errors], bins=20, kde=True, ax=axes[0], color=color_1)
sns.histplot(uptake_errors_array, bins=20, kde=True, ax=axes[1], color=color_2)

axes[1].set_ylabel("")
axes[0].set_xlabel("Sum AE")
axes[1].set_xlabel("Da/peptide length")
fig.subplots_adjust(wspace=0.2)

../_images/tutorials_00_ecdhfr_showcase_47_0.png
[27]:
from pigeon_feather.analysis import get_index_offset
from matplotlib.ticker import LogLocator


check_ana_obj = ana_apo_1
index_offset = get_index_offset(check_ana_obj, './data/6XG5_TRI.pdb')

uptake_errors = sorted(uptake_errors, key=lambda x: x[1].start, reverse=False)

num_subplots_per_figure = math.ceil(len(uptake_errors) / 2)
# num_subplots_per_figure = 250
num_figures = math.ceil(len(uptake_errors) / num_subplots_per_figure)


for fig_index in range(num_figures):
    # Select the subset of errors for the current figure
    selected_uptake_errors = uptake_errors[fig_index * num_subplots_per_figure:(fig_index + 1) * num_subplots_per_figure]
    num_col = math.ceil(len(selected_uptake_errors) / 5)

    fig, axs = plt.subplots(num_col, 5, figsize=(9 * 5, 8 * num_col))  # Adjust subplot size as needed

    for i, error_tuple in enumerate(selected_uptake_errors):


        ax = axs[i // 5, i % 5]


        # Unpack error information
        peptide_data = error_tuple[1]


        ax.axhline(y=peptide_data.max_d, color='lightgray', linestyle='--', linewidth=5)

        check_fitted_peptide_uptake(
            check_ana_obj,
            hdxms_data_list,
            peptide_data,
            if_plot=True,
            state_name=check_state_name,
            figure=fig,
            ax=ax
        )

        #Retrieve and format the peptide identifier
        idf = peptide_data.identifier
        idf_start, idf_end = map(int, re.match(r"(-?\d+)-(-?\d+)", idf).groups())
        idf_seq = idf.split(" ")[1]
        ax.set_title(f"{idf_start - index_offset}-{idf_end - index_offset} {idf_seq}")

        ax.set_xlim(1e1, 1e5)
        ax.xaxis.set_major_locator(LogLocator(base=10.0, numticks=5))

        # y_max = ax.get_ylim()[1]
        # ax.set_ylim(-0.5, y_max + 1)
        pep = error_tuple[1]
        y_max = pep.theo_max_d/check_ana_obj.saturation
        ax.set_ylim(-0.5, y_max + 0.5)

        # light gray dotted line at max deuteration


        handles, labels = ax.get_legend_handles_labels()
        new_labels = [label for label in labels if label.isdigit()]
        new_handles = [handle for handle, label in zip(handles, labels) if label.isdigit()]
        ax.legend(new_handles, new_labels, title='replicate', title_fontsize='small')

    # Layout adjustment and save
    fig.tight_layout()
    fig.savefig(f"{results_path}/ecDHFR_uptake_errors_{check_state_name}_{fig_index}.pdf")

If the fitting is bad, one should not move forward: check the parameters used in the sampling and make sure they agree with the experimental conditions. One may also remove the outlier peptides and rerun all the cells above. An outlier may be a peptide that exchanges a lot while its neighbors barely exchange (examples in the tutorial dataset: 78-92 VDEAIAACGDVPEIM, 75-78 VKSV).
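
A minimal sketch for dropping suspected outliers in place before rerunning (identifiers follow the 'start-end SEQUENCE' format used above; the example set is hypothetical):

outlier_idfs = {"75-78 VKSV", "78-92 VDEAIAACGDVPEIM"}  # hypothetical examples

for data in hdxms_data_list:
    for state in data.states:
        state.peptides = [
            pep for pep in state.peptides if pep.identifier not in outlier_idfs
        ]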

Once you are satisfied with the fitting results, you can make the final results plots.

barplot of the kex

Barplot showing the FEATHER-derived exchange rates. index_offset is determined by comparing the sequence in the HX/MS data with the provided PDB file.

[28]:
# APO state only

import numpy as np


fig, axes = plt.subplots(2, 1, figsize=(40, 10), sharey=True, sharex=True)

ana_apo_1.plot_kex_bar(
    ax=axes[0], resolution_indicator_pos=15, label="APO", show_seq=False, color="k"
)

ana_tri_1.plot_kex_bar(
    ax=axes[1], resolution_indicator_pos=15, label="TRI", show_seq=False, color="red"
)

from matplotlib.pyplot import step
from matplotlib.ticker import FuncFormatter, MultipleLocator

from pigeon_feather.analysis import get_index_offset

pdb_file = "./data/6XG5_TRI.pdb"
index_offset = get_index_offset(ana_apo_1, pdb_file)

# # ax.xaxis.set_major_locator(plt.MultipleLocator(2))
axes[0].set_xticks(axes[0].get_xticks()[::2])
axes[0].set_xticklabels(axes[0].get_xticklabels(), fontdict={"fontsize": 24})


def format_func(value, tick_number):
    return f"{int(value - index_offset +1)}"


axes[0].xaxis.set_major_formatter(FuncFormatter(format_func))

seq_pos = 17
for ii in range(0, len(ana_apo_1.protein_sequence[:]), 2):
    axes[0].text(
        ii,
        seq_pos,
        ana_apo_1.protein_sequence[ii],
        ha="center",
        va="center",
        fontsize=22,
    )


for ax in axes:
    ax.set_ylabel("-log(k$_{ex}$)")
    #ax.legend(loc="lower left")
    ax.yaxis.set_major_locator(plt.MultipleLocator(5))

    ax.plot(
        [-3, len(ana_apo_1.protein_sequence) + 3],
        [5, 5],
        color="gray",
        lw=1,
        linestyle="--",
        alpha=0.5,
    )
    ax.plot(
        [-3, len(ana_apo_1.protein_sequence) + 3],
        [10, 10],
        color="gray",
        lw=1,
        linestyle="--",
        alpha=0.5,
    )


plt.subplots_adjust(hspace=0.08)
fig.savefig(f"{results_path}/ecDHFR_kex_bar_APO_TRI_{today_date}.pdf")
../_images/tutorials_00_ecdhfr_showcase_53_0.png

logP projection to a pdb structure

[29]:
from pigeon_feather.analysis import BFactorPlot
[30]:
bfactor_plot = BFactorPlot(
    ana_apo_1,
    pdb_file="./data/6XG5_TRI.pdb",
    plot_deltaG=True,
    temperature=293.15,
    logP_threshold=10,
)
bfactor_plot.plot(f"{results_path}/6XG5_APO_deltaG.pdb")

# bfactor_plot = BFactorPlot(
#     ana_apo_1,
#     pdb_file="./data/6XG5_APO_relaxed_best_solvated.pdb",
#     plot_deltaG=True,
#     temperature=293.15,
# )
# bfactor_plot.plot(f"{out_path}/6XG5_APO_deltaG.pdb")

delta logP

If there are multiple states available, one can map the difference onto the PDB structure.

[31]:
bfactor_plot = BFactorPlot(
    ana_apo_1,
    ana_tri_1,
    pdb_file="./data/6XG5_TRI.pdb",
    plot_deltaG=True,
    temperature=293.15,
    logP_threshold=10,
)
bfactor_plot.plot(f"{results_path}/6XG5_deltaG_APO-TRI.pdb")

export logPFs to a csv

[32]:
import MDAnalysis
from pigeon_feather.analysis import get_res_avg_logP, get_res_avg_logP_std

pdb_file = "./data/6XG5_TRI.pdb"
index_offset = get_index_offset(ana_apo_1, pdb_file)


def logPF_to_deltaG(ana_obj, logPF):
    """
    :param logPF: logP value
    :return: deltaG in kJ/mol, local unfolding energy
    """

    return 8.3145 * ana_obj.temperature * np.log(10) * logPF / 1000


def create_logP_df(ana_obj, index_offset):
    df_logPF = pd.DataFrame()

    for res_i, _ in enumerate(ana_obj.results_obj.protein_sequence):
        res_obj_i = ana_obj.results_obj.get_residue_by_resindex(res_i)

        avg_logP, std_logP = get_res_avg_logP(res_obj_i)
        #std_logP = get_res_avg_logP_std(res_obj_i)

        df_i = pd.DataFrame(
            {
                "resid": [res_obj_i.resid - index_offset],
                #"resname": [res_obj_i.resname],
                "resname": [MDAnalysis.lib.util.convert_aa_code(res_obj_i.resname)],
                'avg_dG': [logPF_to_deltaG(ana_obj, avg_logP)],
                'std_dG': [logPF_to_deltaG(ana_obj, std_logP)],
                "avg_logP": [avg_logP],
                "std_logP": [max(std_logP, 0.35)],
                "is_nan": [res_obj_i.is_nan()],
            }
        )

        if res_obj_i.is_nan():
            df_i["single resolved"] = [np.nan]
            df_i["mini_pep logPs"] = [np.nan]

        else:
            df_i["single resolved"] = [res_obj_i.mini_pep.if_single_residue()]
            df_i["mini_pep logPs"] = [res_obj_i.clustering_results_logP]

        df_logPF = pd.concat([df_logPF, df_i])

    df_logPF = df_logPF.reset_index(drop=True)

    return df_logPF


df = create_logP_df(ana_apo_1, index_offset)
df.to_csv(f"{results_path}/logP_APO_{today_date}.csv", index=False)

[33]:
# fetch the two dataframes and merge them
df_apo = create_logP_df(ana_apo_1, index_offset)
df_tri = create_logP_df(ana_tri_1, index_offset)

df = pd.merge(df_apo, df_tri, on="resid", suffixes=("_APO", "_TRI"))

df.to_csv(f"{results_path}/logP_APO_TRI_{today_date}.csv", index=False)
[ ]: