Skip to content

Commit

Permalink
Minor patch
Browse files Browse the repository at this point in the history
  • Loading branch information
JLittlef committed Mar 18, 2024
1 parent 67fc599 commit c911d23
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 28 deletions.
12 changes: 11 additions & 1 deletion bin/mutant_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,23 @@
def get_data(csvname: str, col: str):
data = pd.read_csv(csvname)
clipped_data=pd.DataFrame(data=data, columns=[col])
divided_data = clipped_data.Mutation.str.extract(r'([a-zA-Z]+)([^a-zA-Z]+)([a-zA-Z]+)')
divided_data = clipped_data[col].str.extract(r'([a-zA-Z]+)([^a-zA-Z]+)([a-zA-Z]+)')
divided_cleaned = divided_data.to_string(header=None, index=False)
amino_index = {'G': 'GLY' , 'L': 'LEU', 'R': 'ARG', 'A': 'ALA', 'Y': 'TYR', 'S': 'SER', 'M': 'MET', 'I': 'ILE', 'T': 'THR', 'C': 'CYS', 'V': 'VAL', 'P': 'PRO', 'F': 'PHE', 'W': 'TRP', 'H': 'HIS', 'K': 'LYS', 'D': 'ASP', 'E': 'GLU', 'N': 'ASN', 'Q': 'GLN', ' ': '-'}
new_rep_codes = re.sub(r"[GLRAYSMITCVPFWHKDENQ ]", lambda x: amino_index[x.group(0)], divided_cleaned)
new_rep_cleaned = re.sub("--","-", new_rep_codes)
return new_rep_cleaned

def reformat_data(csvname: str, new_rep_cleaned):
df_newcol = new_rep_cleaned.splitlines()
pre_df = {'name': df_newcol}
df1 = pd.DataFrame(pre_df)
df2 = pd.read_csv(csvname)
df3 = pd.concat([df1, df2['ΔΔG']], axis=1)
stem = csvname.replace(".csv","")
csv_reformat = str(stem + "_reformat.csv")
df3.to_csv(csv_reformat)


def clean_wildtype(pdbname: str, pH: str):
pH_fl = float(pH)
Expand Down
98 changes: 71 additions & 27 deletions bin/openmm-minimise.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
#!/usr/bin/env python3
"""openmm-minimise
#!/usr/bin/env python3#!/usr/bin/env python3
"""openmm-minimise-simultaneous
Minimise the potential energy of the wildtype protein structure and mutated variants, according to AMBER14 potentials
Usage:
openmm-minimise.py [--i=<pdb>] [--max-iter=<iter>] [--tol==<tol>]
openmm-minimise-simultaneous.py [--i=<pdb>] [--no-restraints]
Options:
--i=<pdb> Input the fixed PDB files from the Mutant Maker process
--max-iter=<iter> Maximum number of iterations to arrive at minimised energy
--tol=<tol> Tolerance level of energy change after which the protein's structure is considered minimised
--no-restraints Allow movement of all atoms
"""
import logging
import csv
import pandas as pd
import numpy as np
from docopt import docopt
from openmm.app import *
from openmm import *
from openmm.unit import *
import sys
from sys import stdout
import os

def get_pdb(filename: str):
pdb = PDBFile(filename)
Expand All @@ -38,38 +42,78 @@ def setup_simulation(pdb, system):

def main():
arguments = docopt(__doc__, version='openmm-minimise.py')
isFile = os.path.isfile('data.csv')
if isFile == False:
out = {'name':[], 'min':[], 'lowest3':[], 'lowest5':[], 'lowest10':[], 'lowest20':[]}
inidf = pd.DataFrame(out)
inidf.to_csv('data.csv', index=False)
df = pd.read_csv('data.csv')
pdb = get_pdb(arguments['--i'])
forcefield = setup_forcefield()
system = setup_system(pdb, forcefield, arguments['--no-restraints'])
str_name = arguments['--i']
stem = str_name.replace("_fixed.pdb","")
#csvunfolded = str(stem + "_unfolded.csv")
pdbunfolded = str(stem + "_unfolded.pdb")
pdbfolded = str(stem + "_folded.pdb")
simulation = setup_simulation(pdb, system)
init_state = simulation.context.getState(getEnergy=True, getPositions=True)
logging.info("Starting potential energy = %.9f kcal/mol"
delta_g_list = []
for j in range(1,75):
simulation = setup_simulation(pdb, system)
simulation = setup_simulation(pdb, system)
init_state = simulation.context.getState(getEnergy=True, getPositions=True)
logging.info("Starting potential energy = %.9f kcal/mol"
% init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
simulation.minimizeEnergy(tolerance=0.01)
final_state = simulation.context.getState(getEnergy=True, getPositions=True)
logging.info("Minimised potential energy = %.9f kcal/mol"
#simulation.minimizeEnergy(tolerance=0.0000000001)
simulation.minimizeEnergy(tolerance=2.5)
final_state = simulation.context.getState(getEnergy=True, getPositions=True)
logging.info("Minimised potential energy = %.9f kcal/mol"
% final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
PDBFile.writeFile(
#final_energy = final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)
PDBFile.writeFile(
simulation.topology, final_state.getPositions(), open(pdbunfolded, "w"))
pdbII = get_pdb(pdbunfolded)
forcefield = setup_forcefield()
systemII = setup_system(pdbII, forcefield)
simulationII = setup_simulation(pdbII, systemII)
init_state = simulationII.context.getState(getEnergy=True, getPositions=True)
logging.info("Starting potential energy = %.9f kcal/mol"
% init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
simulationII.minimizeEnergy(tolerance=0.00000001)
final_state = simulationII.context.getState(getEnergy=True, getPositions=True)
logging.info("Minimised potential energy = %.9f kcal/mol"
% final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
PDBFile.writeFile(
simulationII.topology, final_state.getPositions(), open(pdbfolded, "w"))
for i in range(1,2):
pdbII = get_pdb(pdbunfolded)
forcefield = setup_forcefield()
systemII = setup_system(pdbII, forcefield)
simulationII = setup_simulation(pdbII, systemII)
init_state = simulationII.context.getState(getEnergy=True, getPositions=True)
init_pe = init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)
logging.info("Starting potential energy = %.9f kcal/mol"
% init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
simulationII.minimizeEnergy(tolerance=0.0000000000001)
final_state = simulationII.context.getState(getEnergy=True, getPositions=True)
final_pe = final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)
delta_G = init_pe - final_pe
logging.info("Minimised potential energy = %.9f kcal/mol"
% final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
if 3 <= delta_G:
delta_g_list.append(delta_G)
logging.info("Folding free energy = %.9f kcal/mol"
% delta_G)
PDBFile.writeFile(
simulationII.topology, final_state.getPositions(), open(pdbfolded, "w"))
min_G = min(delta_g_list)
ordered_G = sorted(delta_g_list)
lowest_3 = np.average(ordered_G[:3])
lowest_5 = np.average(ordered_G[:5])
lowest_10 = np.average(ordered_G[:10])
lowest_20 = np.average(ordered_G[:20])
logging.info("Mean folding free energy (lowest 20) = %.9f kcal/mol"
% lowest_20)
logging.info("Mean folding free energy (lowest 10) = %.9f kcal/mol"
% lowest_10)
logging.info("Mean folding free energy (lowest 5) = %.9f kcal/mol"
% lowest_5)
logging.info("Mean folding free energy (lowest 3) = %.9f kcal/mol"
% lowest_3)
logging.info("Minimum folding free energy = %.9f kcal/mol"
% min_G)
new_row = {'name': [stem], 'min': [min_G], 'lowest3': [lowest_3], 'lowest5': [lowest_5], 'lowest10': [lowest_10], 'lowest20': [lowest_20]}

df_nr = pd.DataFrame(new_row)
df_nr.to_csv('data.csv', mode='a', index=False)


if __name__ == '__main__':
arguments = docopt(__doc__, version='openmm-minimise.py')
logging.basicConfig(stream=stem + ".out", level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)
main()
67 changes: 67 additions & 0 deletions bin/output_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
"""output_data
Calculates ΔΔG relative to the wildtype for using minimum, lowest 3, 5, 10 and 20 outputted folding energies then measures Spearman's Ranking relative to experimental/theoretical benchmarks
Usage:
output_data.py [--bench=<csv>] [--test=<csv>]
Options:
--bench=<csv> Read original experimental/theoretical data from csv file
--test=<csv> Read calculated data from openmm-minimise process
"""
import logging
import csv
import pandas as pd
import numpy as np
from docopt import docopt
import sys
from sys import stdout

def get_energy(test_csv: str):
df2 = pd.read_csv(test_csv)
df3 = df2.loc[df['name'] == 'wildtype']
wt_min = df3.iloc[0]['min']
wt_low3 = df3.iloc[0]['lowest3']
wt_low5 = df3.iloc[0]['lowest5']
wt_low10 = df3.iloc[0]['lowest10']
wt_low20 = df3.iloc[0]['lowest20']
df2['min_ΔΔG'] = df2['min'] - wt_min
df2['lowest3_ΔΔG'] = df2['lowest3'] - wt_low3
df2['lowest5_ΔΔG'] = df2['lowest5'] - wt_low5
df2['lowest10_ΔΔG'] = df2['lowest10'] - wt_low10
df2['lowest20_ΔΔG'] = df2['lowest20'] - wt_low20
df2.drop(df2.index[(df2['name'] == 'wildtype')],axis=0,inplace=True)
df2 = df2[['name', 'min_ΔΔG', 'lowest3_ΔΔG', 'lowest5_ΔΔG', 'lowest10_ΔΔG', 'lowest20_ΔΔG']]
return df2

def spearman_rank(bench_csv: str, df2):
df1 = pd.read_csv(bench_csv)
mergeDf = pd.merge(df1, df2, left_on = ['name'], right_on = ['name'])
min_rank = mergeDf['ΔΔG'].corr(mergeDf['min_ΔΔG'], method='spearman')
low3_rank = mergeDf['ΔΔG'].corr(mergeDf['lowest3_ΔΔG'], method='spearman')
low5_rank = mergeDf['ΔΔG'].corr(mergeDf['lowest5_ΔΔG'], method='spearman')
low10_rank = mergeDf['ΔΔG'].corr(mergeDf['lowest10_ΔΔG'], method='spearman')
low20_rank = mergeDf['ΔΔG'].corr(mergeDf['lowest20_ΔΔG'], method='spearman')
logging.info("Min ΔΔG Spearman's Rank = %.9f "
% min_rank)
logging.info("Lowest 3 ΔΔG Spearman's Rank = %.9f "
% low3_rank)
logging.info("Lowest 5 ΔΔG Spearman's Rank = %.9f "
% low5_rank)
logging.info("Lowest 10 ΔΔG Spearman's Rank = %.9f "
% low10_rank)
logging.info("Lowest 20 ΔΔG Spearman's Rank = %.9f "
% low20_rank)
all_ranks = {'min_rank': [min_rank], 'low3_rank': [low3_rank], 'low5_rank': [low5_rank], 'low10_rank': [low10_rank], 'low20_rank': [low20_rank]}
df_sr = pd.DataFrame(all_ranks)
return df_sr

def main():
arguments = docopt(__doc__, version='output_data.py')
df2 = get_energy(arguments['--test'])
df2.to_csv('data_ΔΔG.csv', mode='w', indexr=False)
df_sr = spearman_rank(arguments['--bench'], df2)
df_sr.to_csv('data_ΔΔG-spearman.csv', mode='w', indexr=False)

if __name__ == '__main__':
arguments = docopt(__doc__, version='openmm-minimise.py')
logging.getLogger().setLevel(logging.INFO)
main()
16 changes: 16 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ process pdbfixer-mutants {
path inpdb
output:
path '*_fixed.pdb', emit: fixed_pdbs
path '*_reformat.csv', emit: csv_reformat
shell:
"""
mutant_maker.py --incsv $incsv --from-col ${params.csv.col} --in-pdb $inpdb ${params.mutant.maker.args}
Expand All @@ -19,17 +20,32 @@ process openmm-minimise {
output:
path '*_unfolded.pdb', emit: unfolded_pdbs
path '*_folded.pdb', emit: folded_pdbs
path 'data.csv', emit: data
shell:
"""
openmm-minimise.py --i $fixed_pdbs
"""
}

process output_data {
input:
path benchcsv
path testcsv
output:
path 'data_ΔΔG.csv', emit: data_ΔΔG
path 'data_ΔΔG-spearman.csv', emit: spearman
shell:
"""
output_data.py --bench $benchcsv --test= $testcsv
"""
}


workflow {
inpath_ch = channel.fromPath("${params.inputFile}")
incsv_ch = channel.fromPath("${params.inputCsv}")
pdbfixer-mutants(inpath_ch, incsv_ch)
openmm-minimise(pdbfixer-mutants.out.fixed_pdbs)
output_data(pdbfixer-mutants.out.csv_reformat, openmm-minimise.out.data)
}

7 changes: 7 additions & 0 deletions testdata/tableExport-2JIEIII.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
ID,"Protein Name",Mutation,PDB,T,pH,Method,Measure,ΔΔG,ΔTm,Reference
13158,"Beta-glucosidase B",N223Y,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-5.73,3.79,"PMID: 32258884"
13159,"Beta-glucosidase B",E167A,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-4.25,8.28,"PMID: 32258884"
13160,"Beta-glucosidase B",N223R,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-3.98,1.57,"PMID: 32258884"
13161,"Beta-glucosidase B",N223G,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-3.46,2.25,"PMID: 32258884"
13162,"Beta-glucosidase B",T221A,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-3.19,2.06,"PMID: 32258884"
13163,"Beta-glucosidase B",C170A,2JIE,298.15,7.5,Thermal,"PTS, Fluorescence",-3.00,7.35,"PMID: 32258884"

0 comments on commit c911d23

Please sign in to comment.