diff --git a/bin/mutant_maker.py b/bin/mutant_maker.py index 02019b1..8fc5ea1 100644 --- a/bin/mutant_maker.py +++ b/bin/mutant_maker.py @@ -24,13 +24,23 @@ def get_data(csvname: str, col: str): data = pd.read_csv(csvname) clipped_data=pd.DataFrame(data=data, columns=[col]) - divided_data = clipped_data.Mutation.str.extract(r'([a-zA-Z]+)([^a-zA-Z]+)([a-zA-Z]+)') + divided_data = clipped_data[col].str.extract(r'([a-zA-Z]+)([^a-zA-Z]+)([a-zA-Z]+)') divided_cleaned = divided_data.to_string(header=None, index=False) amino_index = {'G': 'GLY' , 'L': 'LEU', 'R': 'ARG', 'A': 'ALA', 'Y': 'TYR', 'S': 'SER', 'M': 'MET', 'I': 'ILE', 'T': 'THR', 'C': 'CYS', 'V': 'VAL', 'P': 'PRO', 'F': 'PHE', 'W': 'TRP', 'H': 'HIS', 'K': 'LYS', 'D': 'ASP', 'E': 'GLU', 'N': 'ASN', 'Q': 'GLN', ' ': '-'} new_rep_codes = re.sub(r"[GLRAYSMITCVPFWHKDENQ ]", lambda x: amino_index[x.group(0)], divided_cleaned) new_rep_cleaned = re.sub("--","-", new_rep_codes) return new_rep_cleaned +def reformat_data(csvname: str, new_rep_cleaned): + df_newcol = new_rep_cleaned.splitlines() + pre_df = {'name': df_newcol} + df1 = pd.DataFrame(pre_df) + df2 = pd.read_csv(csvname) + df3 = pd.concat([df1, df2['ΔΔG']], axis=1) + stem = csvname.replace(".csv","") + csv_reformat = str(stem + "_reformat.csv") + df3.to_csv(csv_reformat) + def clean_wildtype(pdbname: str, pH: str): pH_fl = float(pH) diff --git a/bin/openmm-minimise.py b/bin/openmm-minimise.py index 0a6663b..17c874b 100644 --- a/bin/openmm-minimise.py +++ b/bin/openmm-minimise.py @@ -1,22 +1,26 @@ -#!/usr/bin/env python3 -"""openmm-minimise +#!/usr/bin/env python3#!/usr/bin/env python3 +"""openmm-minimise-simultaneous Minimise the potential energy of the wildtype protein structure and mutated variants, according to AMBER14 potentials Usage: -openmm-minimise.py [--i=] [--max-iter=] [--tol==] +openmm-minimise-simultaneous.py [--i=] [--no-restraints] + Options: --i= Input the fixed PDB files from the Mutant Maker process --max-iter= Maximum number of iterations to arrive at minimised energy ---tol= Tolerance level of energy change after which the protein's structure is considered minimised - +--no-restraints Allow movement of all atoms """ import logging +import csv +import pandas as pd +import numpy as np from docopt import docopt from openmm.app import * from openmm import * from openmm.unit import * import sys from sys import stdout +import os def get_pdb(filename: str): pdb = PDBFile(filename) @@ -38,38 +42,78 @@ def setup_simulation(pdb, system): def main(): arguments = docopt(__doc__, version='openmm-minimise.py') + isFile = os.path.isfile('data.csv') + if isFile == False: + out = {'name':[], 'min':[], 'lowest3':[], 'lowest5':[], 'lowest10':[], 'lowest20':[]} + inidf = pd.DataFrame(out) + inidf.to_csv('data.csv', index=False) + df = pd.read_csv('data.csv') pdb = get_pdb(arguments['--i']) forcefield = setup_forcefield() system = setup_system(pdb, forcefield, arguments['--no-restraints']) str_name = arguments['--i'] stem = str_name.replace("_fixed.pdb","") - #csvunfolded = str(stem + "_unfolded.csv") pdbunfolded = str(stem + "_unfolded.pdb") pdbfolded = str(stem + "_folded.pdb") - simulation = setup_simulation(pdb, system) - init_state = simulation.context.getState(getEnergy=True, getPositions=True) - logging.info("Starting potential energy = %.9f kcal/mol" + delta_g_list = [] + for j in range(1,75): + simulation = setup_simulation(pdb, system) + simulation = setup_simulation(pdb, system) + init_state = simulation.context.getState(getEnergy=True, getPositions=True) + logging.info("Starting potential energy = %.9f kcal/mol" % init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) - simulation.minimizeEnergy(tolerance=0.01) - final_state = simulation.context.getState(getEnergy=True, getPositions=True) - logging.info("Minimised potential energy = %.9f kcal/mol" + #simulation.minimizeEnergy(tolerance=0.0000000001) + simulation.minimizeEnergy(tolerance=2.5) + final_state = simulation.context.getState(getEnergy=True, getPositions=True) + logging.info("Minimised potential energy = %.9f kcal/mol" % final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) - PDBFile.writeFile( + #final_energy = final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole) + PDBFile.writeFile( simulation.topology, final_state.getPositions(), open(pdbunfolded, "w")) - pdbII = get_pdb(pdbunfolded) - forcefield = setup_forcefield() - systemII = setup_system(pdbII, forcefield) - simulationII = setup_simulation(pdbII, systemII) - init_state = simulationII.context.getState(getEnergy=True, getPositions=True) - logging.info("Starting potential energy = %.9f kcal/mol" - % init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) - simulationII.minimizeEnergy(tolerance=0.00000001) - final_state = simulationII.context.getState(getEnergy=True, getPositions=True) - logging.info("Minimised potential energy = %.9f kcal/mol" - % final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) - PDBFile.writeFile( - simulationII.topology, final_state.getPositions(), open(pdbfolded, "w")) + for i in range(1,2): + pdbII = get_pdb(pdbunfolded) + forcefield = setup_forcefield() + systemII = setup_system(pdbII, forcefield) + simulationII = setup_simulation(pdbII, systemII) + init_state = simulationII.context.getState(getEnergy=True, getPositions=True) + init_pe = init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole) + logging.info("Starting potential energy = %.9f kcal/mol" + % init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) + simulationII.minimizeEnergy(tolerance=0.0000000000001) + final_state = simulationII.context.getState(getEnergy=True, getPositions=True) + final_pe = final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole) + delta_G = init_pe - final_pe + logging.info("Minimised potential energy = %.9f kcal/mol" + % final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) + if 3 <= delta_G: + delta_g_list.append(delta_G) + logging.info("Folding free energy = %.9f kcal/mol" + % delta_G) + PDBFile.writeFile( + simulationII.topology, final_state.getPositions(), open(pdbfolded, "w")) + min_G = min(delta_g_list) + ordered_G = sorted(delta_g_list) + lowest_3 = np.average(ordered_G[:3]) + lowest_5 = np.average(ordered_G[:5]) + lowest_10 = np.average(ordered_G[:10]) + lowest_20 = np.average(ordered_G[:20]) + logging.info("Mean folding free energy (lowest 20) = %.9f kcal/mol" + % lowest_20) + logging.info("Mean folding free energy (lowest 10) = %.9f kcal/mol" + % lowest_10) + logging.info("Mean folding free energy (lowest 5) = %.9f kcal/mol" + % lowest_5) + logging.info("Mean folding free energy (lowest 3) = %.9f kcal/mol" + % lowest_3) + logging.info("Minimum folding free energy = %.9f kcal/mol" + % min_G) + new_row = {'name': [stem], 'min': [min_G], 'lowest3': [lowest_3], 'lowest5': [lowest_5], 'lowest10': [lowest_10], 'lowest20': [lowest_20]} + + df_nr = pd.DataFrame(new_row) + df_nr.to_csv('data.csv', mode='a', index=False) + + if __name__ == '__main__': arguments = docopt(__doc__, version='openmm-minimise.py') - logging.basicConfig(stream=stem + ".out", level=logging.INFO) + logging.getLogger().setLevel(logging.INFO) main() diff --git a/bin/output_data.py b/bin/output_data.py new file mode 100755 index 0000000..d899d0e --- /dev/null +++ b/bin/output_data.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +"""output_data +Calculates ΔΔG relative to the wildtype for using minimum, lowest 3, 5, 10 and 20 outputted folding energies then measures Spearman's Ranking relative to experimental/theoretical benchmarks +Usage: +output_data.py [--bench=] [--test=] +Options: +--bench= Read original experimental/theoretical data from csv file +--test= Read calculated data from openmm-minimise process +""" +import logging +import csv +import pandas as pd +import numpy as np +from docopt import docopt +import sys +from sys import stdout + +def get_energy(test_csv: str): + df2 = pd.read_csv(test_csv) + df3 = df2.loc[df['name'] == 'wildtype'] + wt_min = df3.iloc[0]['min'] + wt_low3 = df3.iloc[0]['lowest3'] + wt_low5 = df3.iloc[0]['lowest5'] + wt_low10 = df3.iloc[0]['lowest10'] + wt_low20 = df3.iloc[0]['lowest20'] + df2['min_ΔΔG'] = df2['min'] - wt_min + df2['lowest3_ΔΔG'] = df2['lowest3'] - wt_low3 + df2['lowest5_ΔΔG'] = df2['lowest5'] - wt_low5 + df2['lowest10_ΔΔG'] = df2['lowest10'] - wt_low10 + df2['lowest20_ΔΔG'] = df2['lowest20'] - wt_low20 + df2.drop(df2.index[(df2['name'] == 'wildtype')],axis=0,inplace=True) + df2 = df2[['name', 'min_ΔΔG', 'lowest3_ΔΔG', 'lowest5_ΔΔG', 'lowest10_ΔΔG', 'lowest20_ΔΔG']] + return df2 + +def spearman_rank(bench_csv: str, df2): + df1 = pd.read_csv(bench_csv) + mergeDf = pd.merge(df1, df2, left_on = ['name'], right_on = ['name']) + min_rank = mergeDf['ΔΔG'].corr(mergeDf['min_ΔΔG'], method='spearman') + low3_rank = mergeDf['ΔΔG'].corr(mergeDf['lowest3_ΔΔG'], method='spearman') + low5_rank = mergeDf['ΔΔG'].corr(mergeDf['lowest5_ΔΔG'], method='spearman') + low10_rank = mergeDf['ΔΔG'].corr(mergeDf['lowest10_ΔΔG'], method='spearman') + low20_rank = mergeDf['ΔΔG'].corr(mergeDf['lowest20_ΔΔG'], method='spearman') + logging.info("Min ΔΔG Spearman's Rank = %.9f " + % min_rank) + logging.info("Lowest 3 ΔΔG Spearman's Rank = %.9f " + % low3_rank) + logging.info("Lowest 5 ΔΔG Spearman's Rank = %.9f " + % low5_rank) + logging.info("Lowest 10 ΔΔG Spearman's Rank = %.9f " + % low10_rank) + logging.info("Lowest 20 ΔΔG Spearman's Rank = %.9f " + % low20_rank) + all_ranks = {'min_rank': [min_rank], 'low3_rank': [low3_rank], 'low5_rank': [low5_rank], 'low10_rank': [low10_rank], 'low20_rank': [low20_rank]} + df_sr = pd.DataFrame(all_ranks) + return df_sr + +def main(): + arguments = docopt(__doc__, version='output_data.py') + df2 = get_energy(arguments['--test']) + df2.to_csv('data_ΔΔG.csv', mode='w', indexr=False) + df_sr = spearman_rank(arguments['--bench'], df2) + df_sr.to_csv('data_ΔΔG-spearman.csv', mode='w', indexr=False) + +if __name__ == '__main__': + arguments = docopt(__doc__, version='openmm-minimise.py') + logging.getLogger().setLevel(logging.INFO) + main() diff --git a/main.nf b/main.nf index 4b5c67b..50b8f2f 100644 --- a/main.nf +++ b/main.nf @@ -7,6 +7,7 @@ process pdbfixer-mutants { path inpdb output: path '*_fixed.pdb', emit: fixed_pdbs + path '*_reformat.csv', emit: csv_reformat shell: """ mutant_maker.py --incsv $incsv --from-col ${params.csv.col} --in-pdb $inpdb ${params.mutant.maker.args} @@ -19,17 +20,32 @@ process openmm-minimise { output: path '*_unfolded.pdb', emit: unfolded_pdbs path '*_folded.pdb', emit: folded_pdbs + path 'data.csv', emit: data shell: """ openmm-minimise.py --i $fixed_pdbs """ } +process output_data { + input: + path benchcsv + path testcsv + output: + path 'data_ΔΔG.csv', emit: data_ΔΔG + path 'data_ΔΔG-spearman.csv', emit: spearman + shell: + """ + output_data.py --bench $benchcsv --test= $testcsv + """ +} + workflow { inpath_ch = channel.fromPath("${params.inputFile}") incsv_ch = channel.fromPath("${params.inputCsv}") pdbfixer-mutants(inpath_ch, incsv_ch) openmm-minimise(pdbfixer-mutants.out.fixed_pdbs) + output_data(pdbfixer-mutants.out.csv_reformat, openmm-minimise.out.data) } diff --git a/testdata/tableExport-2JIEIII.csv b/testdata/tableExport-2JIEIII.csv new file mode 100644 index 0000000..c92bb30 --- /dev/null +++ b/testdata/tableExport-2JIEIII.csv @@ -0,0 +1,7 @@ +ID,"Protein Name",Mutation,PDB,T,pH,Method,Measure,ΔΔG,ΔTm,Reference +13158,"Beta-glucosidase B",N223Y,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-5.73,3.79,"PMID: 32258884" +13159,"Beta-glucosidase B",E167A,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-4.25,8.28,"PMID: 32258884" +13160,"Beta-glucosidase B",N223R,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-3.98,1.57,"PMID: 32258884" +13161,"Beta-glucosidase B",N223G,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-3.46,2.25,"PMID: 32258884" +13162,"Beta-glucosidase B",T221A,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-3.19,2.06,"PMID: 32258884" +13163,"Beta-glucosidase B",C170A,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-3.00,7.35,"PMID: 32258884"