diff --git a/bin/mutant_maker.py b/bin/mutant_maker.py index ba31811..02019b1 100644 --- a/bin/mutant_maker.py +++ b/bin/mutant_maker.py @@ -1,16 +1,70 @@ #!/usr/bin/env python3 +"""mutant_maker + +Fix PDB files, adding missing residues, atoms and hydrogens, then create mutations in desired locations +Usage: +mutant_maker.py [--incsv=] [--from-col=] [--in-pdb=] [--chain=] [--pH=] + +Options: +--incsv= Read data from csv file +--from-col= Select column title containing mutations +--in-pdb= PDB file of protein wildtype +--chain= Select chain upon which to perform mutation +--pH= Set pH of the protein +""" + +from docopt import docopt +import pandas as pd +import re from openmm.app import PDBFile import sys from sys import stdout from pdbfixer import PDBFixer -pdb = PDBFixer(sys.argv[1]) -pdb.addMutations(mutationChain='A', mutationResidues='19', newResidues='ALA') -pdb.findMissingResidues() -pdb.findNonstandardResidues() -pdb.replaceNonstandardResidues() -pdb.removeHeterogens(True) -pdb.findMissingAtoms() -pdb.addMissingAtoms() -pdb.addMissingHydrogens(7.0) -PDBFile.writeFile(pdb.topology, pdb.positions, open('modified_protein.pdb', 'w')) \ No newline at end of file +def get_data(csvname: str, col: str): + data = pd.read_csv(csvname) + clipped_data=pd.DataFrame(data=data, columns=[col]) + divided_data = clipped_data.Mutation.str.extract(r'([a-zA-Z]+)([^a-zA-Z]+)([a-zA-Z]+)') + divided_cleaned = divided_data.to_string(header=None, index=False) + amino_index = {'G': 'GLY' , 'L': 'LEU', 'R': 'ARG', 'A': 'ALA', 'Y': 'TYR', 'S': 'SER', 'M': 'MET', 'I': 'ILE', 'T': 'THR', 'C': 'CYS', 'V': 'VAL', 'P': 'PRO', 'F': 'PHE', 'W': 'TRP', 'H': 'HIS', 'K': 'LYS', 'D': 'ASP', 'E': 'GLU', 'N': 'ASN', 'Q': 'GLN', ' ': '-'} + new_rep_codes = re.sub(r"[GLRAYSMITCVPFWHKDENQ ]", lambda x: amino_index[x.group(0)], divided_cleaned) + new_rep_cleaned = re.sub("--","-", new_rep_codes) + return new_rep_cleaned + + +def clean_wildtype(pdbname: str, pH: str): + pH_fl = float(pH) + pdb = PDBFixer(pdbname) + pdb.findMissingResidues() + pdb.findNonstandardResidues() + pdb.replaceNonstandardResidues() + pdb.removeHeterogens(False) + pdb.findMissingAtoms() + pdb.addMissingAtoms() + pdb.addMissingHydrogens(pH_fl) + PDBFile.writeFile(pdb.topology, pdb.positions, open("wildtype_fixed.pdb", 'w')) + + +def create_mutants(pdbname: str, new_rep_cleaned, chain: str, pH: str): + pH_fl = float(pH) + for mutant in new_rep_cleaned.splitlines(): + mutpdb = PDBFixer(pdbname) + mutpdb.applyMutations([mutant], chain) + mutpdb.findMissingResidues() + mutpdb.findNonstandardResidues() + mutpdb.replaceNonstandardResidues() + mutpdb.removeHeterogens(False) + mutpdb.findMissingAtoms() + mutpdb.addMissingAtoms() + mutpdb.addMissingHydrogens(pH_fl) + PDBFile.writeFile(pdb.topology, pdb.positions, open(mutant + "_fixed.pdb", 'w')) + + +def main(): + arguments = docopt(__doc__, version='mutant_maker.py') + new_rep_cleaned = get_data(arguments['--incsv'], arguments['--from-col']) + clean_wildtype(arguments['--inpdb'], arguments['--pH']) + create_mutants(arguments['--inpdb'], new_rep_cleaned, arguments['--chain'], arguments['--pH']) + +if __name__ == '__main__': + main() diff --git a/bin/openmm-minimise.py b/bin/openmm-minimise.py new file mode 100644 index 0000000..0a6663b --- /dev/null +++ b/bin/openmm-minimise.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +"""openmm-minimise + +Minimise the potential energy of the wildtype protein structure and mutated variants, according to AMBER14 potentials +Usage: +openmm-minimise.py [--i=] [--max-iter=] [--tol==] +Options: +--i= Input the fixed PDB files from the Mutant Maker process +--max-iter= Maximum number of iterations to arrive at minimised energy +--tol= Tolerance level of energy change after which the protein's structure is considered minimised + +""" +import logging +from docopt import docopt +from openmm.app import * +from openmm import * +from openmm.unit import * +import sys +from sys import stdout + +def get_pdb(filename: str): + pdb = PDBFile(filename) + return pdb + +def setup_forcefield(): + forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml') + return forcefield + +def setup_system(pdb, forcefield): + system = forcefield.createSystem(pdb.topology, nonbondedMethod=NoCutoff) + return system + +def setup_simulation(pdb, system): + integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.001*picoseconds) + simulation = Simulation(pdb.topology, system, integrator) + simulation.context.setPositions(pdb.positions) + return simulation + +def main(): + arguments = docopt(__doc__, version='openmm-minimise.py') + pdb = get_pdb(arguments['--i']) + forcefield = setup_forcefield() + system = setup_system(pdb, forcefield, arguments['--no-restraints']) + str_name = arguments['--i'] + stem = str_name.replace("_fixed.pdb","") + #csvunfolded = str(stem + "_unfolded.csv") + pdbunfolded = str(stem + "_unfolded.pdb") + pdbfolded = str(stem + "_folded.pdb") + simulation = setup_simulation(pdb, system) + init_state = simulation.context.getState(getEnergy=True, getPositions=True) + logging.info("Starting potential energy = %.9f kcal/mol" + % init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) + simulation.minimizeEnergy(tolerance=0.01) + final_state = simulation.context.getState(getEnergy=True, getPositions=True) + logging.info("Minimised potential energy = %.9f kcal/mol" + % final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) + PDBFile.writeFile( + simulation.topology, final_state.getPositions(), open(pdbunfolded, "w")) + pdbII = get_pdb(pdbunfolded) + forcefield = setup_forcefield() + systemII = setup_system(pdbII, forcefield) + simulationII = setup_simulation(pdbII, systemII) + init_state = simulationII.context.getState(getEnergy=True, getPositions=True) + logging.info("Starting potential energy = %.9f kcal/mol" + % init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) + simulationII.minimizeEnergy(tolerance=0.00000001) + final_state = simulationII.context.getState(getEnergy=True, getPositions=True) + logging.info("Minimised potential energy = %.9f kcal/mol" + % final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)) + PDBFile.writeFile( + simulationII.topology, final_state.getPositions(), open(pdbfolded, "w")) +if __name__ == '__main__': + arguments = docopt(__doc__, version='openmm-minimise.py') + logging.basicConfig(stream=stem + ".out", level=logging.INFO) + main() diff --git a/bin/openmm-runner.py b/bin/openmm-runner.py index 4060e6c..9e3eabd 100755 --- a/bin/openmm-runner.py +++ b/bin/openmm-runner.py @@ -1,46 +1,77 @@ #!/usr/bin/env python3 +"""openmm-md + +Minimise the potential energy of the wildtype protein structure and mutated variants, according to AMBER14 potentials +Usage: +openmm-md.py [--i=] [--solv-mol=] [--temp==] [--pres=] [--nvt=] [--npt=] +Options: +--i= Input the fixed PDB files from the Mutant Maker process +--solv-mol= Number of solvent molecules (default: solvation with H2O) +--temp= Temperature of MD thermostat +--pres= Pressure of MD barostat (NPT MD only) +--nvt= Number of NVT trajectory steps to initiate MD (each step = 1fs) +--npt= Number of NPT steps in principal MD trajectory (each step = 1fs) +--rep= Reporting rate wherefrom PDB coordinates and therodynamic conditions are outputted to trajectory (PDB) and log (CSV) files respectively +""" +import logging +from docopt import docopt from openmm.app import * from openmm import * from openmm.unit import * import sys from sys import stdout -from pdbfixer import PDBFixer - -#pdb = PDBFile(sys.argv[1]) -pdb = PDBFixer(sys.argv[1]) -pdb.applyMutations(["SER-19-ALA"], "A") -pdb.findMissingResidues() -pdb.findNonstandardResidues() -pdb.replaceNonstandardResidues() -pdb.removeHeterogens(True) -pdb.findMissingAtoms() -pdb.addMissingAtoms() -pdb.addMissingHydrogens(7.0) -forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml') -modeller = Modeller(pdb.topology, pdb.positions) -#modeller.deleteWater() -#residues=modeller.addHydrogens(forcefield) -solvmol=int(sys.argv[4]) -modeller.addSolvent(forcefield, numAdded=solvmol) -system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME, nonbondedCutoff=1.0*nanometer, constraints=HBonds) -temp=int(sys.argv[2]) -integrator = LangevinMiddleIntegrator(temp*kelvin, 1/picosecond, 0.001*picoseconds) -simulation = Simulation(modeller.topology, system, integrator) -simulation.context.setPositions(modeller.positions) -print("Minimizing energy") -simulation.minimizeEnergy() -print("Running NVT") -reprate=int(sys.argv[7]) -simulation.reporters.append(PDBReporter(sys.argv[8], reprate)) -simulation.reporters.append(StateDataReporter(stdout, reprate, step=True, - potentialEnergy=True, temperature=True, volume=True)) -simulation.reporters.append(StateDataReporter(sys.argv[9], reprate, step=True, + + +def get_pdb(filename: str): + pdb = PDBFile(filename) + return pdb + +def setup_forcefield(): + forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml') + return forcefield + +def setup_modeller(pdb): + modeller = Modeller(pdb.topology, pdb.positions) + return modeller + +def setup_system(modeller, forcefield, solvmol: str): + mol=int(solvmol) + modeller.addSolvent(forcefield, numAdded=mol) + system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME, nonbondedCutoff=1.0*nanometer, constraints=HBonds) + return system + +def setup_simulation(modeller, system, temp: str): + temp_fl = int(temp) + integrator = LangevinMiddleIntegrator(temp_fl*kelvin, 1/picosecond, 0.001*picoseconds) + simulation = Simulation(modeller.topology, system, integrator) + simulation.context.setPositions(modeller.positions) + return simulation + +def main(): + arguments = docopt(__doc__, version='openmm-md.py') + pdb = get_pdb(arguments['--i']) + forcefield = setup_forcefield() + modeller = setup_modeller(pdb) + system = setup_system(modeller, forcefield, arguments['--solv-mol']) + simulation = setup_simulation(modeller, system, arguments['--temp']) + print("Minimizing energy") + simulation.minimizeEnergy() + print("Running NVT") + reprate = int(arguments['--rep']) + nvt = int(arguments['--nvt']) + npt = int(arguments['--npt']) + pres = int(arguments['--pres']) + temp = int(arguments['--temp']) + str_name = arguments['--i'] + stem = str_name.replace("_opt.pdb", "") + simulation.reporters.append(PDBReporter(stem + "_MD.pdb", reprate)) + simulation.reporters.append(StateDataReporter(stdout, reprate, step=True, potentialEnergy=True, temperature=True, volume=True)) + simulation.reporters.append(StateDataReporter(stem + "_log.csv", reprate, step=True, potentialEnergy=True, temperature=True, volume=True)) -nvtsteps=int(sys.argv[5]) -pres=int(sys.argv[3]) -simulation.step(nvtsteps) -system.addForce(MonteCarloBarostat(pres*bar, temp*kelvin)) -simulation.context.reinitialize(preserveState=True) -print("Running NPT") -nptsteps=int(sys.argv[6]) -simulation.step(nptsteps) + simulation.step(nvt) + system.addForce(MonteCarloBarostat(pres*bar, temp*kelvin)) + simulation.context.reinitialize(preserveState=True) + print("Running NPT") + simulation.step(npt) +if __name__ == '__main__': + main() diff --git a/main.nf b/main.nf index 5214804..4b5c67b 100644 --- a/main.nf +++ b/main.nf @@ -1,50 +1,35 @@ // enabling nextflow DSL v2 nextflow.enable.dsl=2 -process openmm { - - publishDir "${params.resultsDir}", mode: 'copy' - +process pdbfixer-mutants { input: + path incsv path inpdb - val temp - val pres - val solvmol - val nptrun - val nvtrun - val reprate output: - path 'traj.pdb' - path 'md_log.txt', emit: md_log - -shell: + path '*_fixed.pdb', emit: fixed_pdbs + shell: """ - openmm-runner.py $inpdb $temp $pres $solvmol $nptrun $nvtrun $reprate traj.pdb md_log.txt + mutant_maker.py --incsv $incsv --from-col ${params.csv.col} --in-pdb $inpdb ${params.mutant.maker.args} """ - } -process hydro_plot { +process openmm-minimise { input: - path md_log - val skipsteps - + path fixed_pdbs + output: + path '*_unfolded.pdb', emit: unfolded_pdbs + path '*_folded.pdb', emit: folded_pdbs shell: - """ - plot.py $md_log $skipsteps - """ + """ + openmm-minimise.py --i $fixed_pdbs + """ } + workflow { inpath_ch = channel.fromPath("${params.inputFile}") - temp = Channel.value(params.temp) - pres = Channel.value(params.pres) - solvmol = Channel.value(params.solvmol) - nvtrun = Channel.value(params.nvtrun) - nptrun = Channel.value(params.nptrun) - reprate = Channel.value(params.reprate) - skipsteps = Channel.value(params.skipsteps) - openmm(inpath_ch, temp, pres, solvmol, nvtrun, nptrun, reprate) - hydro_plot(openmm.out.md_log, skipsteps) + incsv_ch = channel.fromPath("${params.inputCsv}") + pdbfixer-mutants(inpath_ch, incsv_ch) + openmm-minimise(pdbfixer-mutants.out.fixed_pdbs) } diff --git a/nextflow.config b/nextflow.config index 90ce0a8..de828f3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,7 +2,9 @@ params { inputFile = "./testdata/2JIE.pdb" resultsDir = "./results/" -} + inputCsv = "./testdata/tableExport-2JIEIII.csv" + csv.col = "Mutation" + mutant.maker.args = "--chain A --pH 7.5" // include basic process configuration options includeConfig 'conf/base.config'