From 25152dfeea4200c4a527c49f9cc5487dff86b0bd Mon Sep 17 00:00:00 2001
From: Calvin <calvinpieters@gmail.com>
Date: Sat, 21 Dec 2024 23:02:12 +0200
Subject: [PATCH] Server Running CREST

---
 arc/job/adapters/ts/heuristics.py | 357 +++++++++++++++++++-----------
 arc/settings/submit.py            |  48 ++++
 arc/species/converter.py          |   2 +
 3 files changed, 275 insertions(+), 132 deletions(-)

diff --git a/arc/job/adapters/ts/heuristics.py b/arc/job/adapters/ts/heuristics.py
index 6d2118cc42..69cf3da7c0 100644
--- a/arc/job/adapters/ts/heuristics.py
+++ b/arc/job/adapters/ts/heuristics.py
@@ -16,6 +16,7 @@
 """
 
 import datetime
+import time
 import itertools
 import math
 import subprocess
@@ -33,17 +34,17 @@
 from arkane.statmech import is_linear
 
 from arc.common import almost_equal_coords, get_logger, is_angle_linear, key_by_val
-from arc.imports import settings
+from arc.imports import settings, submit_scripts
 from arc.job.adapter import JobAdapter
 from arc.job.adapters.common import _initialize_adapter, ts_adapters_by_rmg_family
 from arc.job.factory import register_job_adapter
+from arc.job.local import submit_job, check_job_status
 from arc.plotter import save_geo
 from arc.species.converter import (compare_zmats, 
                                    relocate_zmat_dummy_atoms_to_the_end, 
                                    zmat_from_xyz, 
                                    zmat_to_xyz, 
                                    str_to_xyz, 
-                                   xyz_to_str, 
                                    str_to_str, 
                                    xyz_to_str, 
                                    xyz_to_dmat)
@@ -60,7 +61,7 @@
     CREST_PATH = settings['CREST_PATH']
     CREST_ENV_PATH = settings['CREST_ENV_PATH']
     HAS_CREST = True
-
+    SERVERS = settings['servers']
 except KeyError:
     HAS_CREST = False
 
@@ -999,7 +1000,7 @@ def h_abstraction(arc_reaction: 'ARCReaction',
         # Don't modify dihedrals for an attacking H (or other linear radical) at a linear angle, C ~ A -- H1 - H2 -- H.
         dihedral_increment = 360
 
-    for rmg_reaction in rmg_reactions:
+    for i, rmg_reaction in enumerate(rmg_reactions):
         rmg_reactant_mol = rmg_reaction.reactants[int(reactants_reversed)].molecule[0]
         rmg_product_mol = rmg_reaction.products[int(not products_reversed)].molecule[0]
         h1 = rmg_reactant_mol.atoms.index([atom for atom in rmg_reactant_mol.atoms if atom.label == '*2'][0])
@@ -1024,7 +1025,7 @@ def h_abstraction(arc_reaction: 'ARCReaction',
             d2_d3_product = [(None, d3) for d3 in d3_values]
         else:
             d2_d3_product = [(None, None)]
-
+        rmg_xyz_guesses = list()
         zmats = list()
         for d2, d3 in d2_d3_product:
             xyz_guess = None
@@ -1058,10 +1059,10 @@ def h_abstraction(arc_reaction: 'ARCReaction',
                     # This TS is unique, and has no atom collisions.
                     zmats.append(zmat_guess)
                     xyz_guesses.append(xyz_guess)
-
-    if xyz_guesses:
-        for i, xyz_guess_crest in enumerate(xyz_guesses):
-            # 1. Check if dict
+                    rmg_xyz_guesses.append(xyz_guess)
+        crest_paths = list()
+        if rmg_xyz_guesses and HAS_CREST:
+            xyz_guess_crest = rmg_xyz_guesses[0]
             if isinstance(xyz_guess_crest, dict):
                 df_dmat = convert_xyz_to_df(xyz_guess_crest)
             elif isinstance(xyz_guess_crest, str):
@@ -1077,13 +1078,18 @@ def h_abstraction(arc_reaction: 'ARCReaction',
                 a = extract_digits(h_abs_atoms_dict['A'])
                 h = extract_digits(h_abs_atoms_dict['H'])
                 b = extract_digits(h_abs_atoms_dict['B'])
-                xyz_guess = crest_ts_conformer_search(xyz_guess_crest, a, h, b, path=path, xyz_crest_int=i)
-                if xyz_guess is not None:
-                    logger.info('Successfully completed crest conformer search:'
-                                f' {xyz_to_str(xyz_guess)}')
-                    xyz_guesses.append(xyz_guess)
-            except (ValueError or KeyError) as e:
-                logger.error(f'Could not determine the H abstraction atoms, got:\n{e}')
+                crest_path = crest_ts_conformer_search(xyz_guess_crest, a, h, b, path=path)
+                crest_paths.append(crest_path)
+            except (ValueError, KeyError) as e:
+                logger.error(f"Could not determine the H abstraction atoms, got:\n{e}")
+
+        if crest_paths:
+            crest_jobs = submit_crest_jobs(crest_paths)
+            monitor_crest_jobs(crest_jobs)  # Keep checking job statuses until complete
+            process_completed_jobs(crest_jobs, xyz_guesses)
+        else:
+            logger.error("No CREST paths found")
+
 
     return xyz_guesses
 
@@ -1099,33 +1105,23 @@ def crest_ts_conformer_search(xyz_guess: dict, a_atom: int, h_atom: int, b_atom:
     # TODO: Pass the atom A - H - B for constraints
     
     """
-    path = os.path.join(path, 'crest')
+    path = os.path.join(path, f'crest_{xyz_crest_int}')
     if not os.path.exists(path):
         os.makedirs(path)
 
     # Write coords to coords.ref file
     symbols = xyz_guess['symbols']
-    coords = xyz_guess['coords']
-    angstrom_to_bohr = 1.8897259886
-
-
-    with open(os.path.join(path, 'coords.ref'), 'w') as f:
-        # Format - XYZ in Bohr:
-        #
-        # $coord
-        # x y z atom1
-        # x y z atom2
-        # ...
-        # $end
-
-        f.write('$coord\n')
-        for i, (x, y, z) in enumerate(coords):
-            # Convert to Bohr from Angstrom
-            x_bohr = x * angstrom_to_bohr
-            y_bohr = y * angstrom_to_bohr
-            z_bohr = z * angstrom_to_bohr
-            f.write(f'    {x_bohr:>18.14f} {y_bohr:>18.14f} {z_bohr:>18.14f}   {symbols[i]}\n')
-        f.write('$end\n')
+
+    converted_coords = str_to_str(xyz_str=xyz_guess['coords'], reverse_atoms=True, convert_to="bohr")
+    coords_ref_content = f"$coord\n{converted_coords}\n$end\n"
+    coords_ref_path = os.path.join(path, 'coords.ref')
+
+    try:
+        with open(coords_ref_path, 'w') as f:
+            f.write(coords_ref_content)
+    except IOError as e:
+        print(f"Failed to write to {coords_ref_path}: {e}")
+        return None
     
     # Get count of atoms - remember to add 1 to all atom numbers
     num_atoms = len(symbols)
@@ -1150,36 +1146,133 @@ def crest_ts_conformer_search(xyz_guess: dict, a_atom: int, h_atom: int, b_atom:
         f.write('$metadyn\n')
         f.write(f'  atoms: {", ".join(map(str, list_of_atoms_numbers_not_participating_in_reaction))}\n')
         f.write('$end\n')
-    # Run CREST
-    # Prepare the command
+
     commands = [
         f'{CREST_PATH}',
+        f' -T {SERVERS["local"].get("cpus", 8)}',
         f'{path}/coords.ref',
         f'--cinp {path}/constraints.inp',
         '--noreftopo'
     ]
     command = ' '.join(commands)
     command = f"{CREST_ENV_PATH} && {command}" if CREST_ENV_PATH else command
-    # Run the command using Popen
-    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=path, executable='/bin/bash')    
-    stdout, stderr = process.communicate()
-
-    if process.returncode == 0:
-        print("Command completed successfully.")
-        with open(os.path.join(path, 'crest_best.xyz'), 'r') as f:
-            content = f.read()
-        
-        xyz_guess = str_to_xyz(content)
-        # Delete unnecessary files
-        for file in os.listdir(path):
-            if file not in {'coords.ref', 'constraints.inp', 'crest_best.xyz'}:
-                os.remove(os.path.join(path, file))
-        return xyz_guess
-    else:
-        print(f"Command failed with return code {process.returncode}")
-        print(f"Standard Output: {stdout.decode()}")
-        print(f"Standard Error: {stderr.decode()}")
-        return None
+
+    if SERVERS.get('local') is not None:
+        if SERVERS['local']['cluster_soft'].lower() in ['condor', 'htcondor']:
+            # Write job submission scripts
+
+            sub_job = submit_scripts['local']['crest']
+            format_params = {
+                "name": f"crest_{xyz_crest_int}",
+                "cpus": SERVERS['local'].get('cpus', 8),
+                "memory": SERVERS['local'].get('memory', 32.0) * 1024,
+            }
+            sub_job = sub_job.format(**format_params)
+
+            with open(os.path.join(path, settings['submit_filenames']['HTCondor']), 'w') as f:
+                f.write(sub_job)
+
+            # Write the crest job
+            crest_job = submit_scripts['local']['crest_job']
+            format_params = {
+                "commands": command,
+            }
+            crest_job = crest_job.format(**format_params)
+
+            with open(os.path.join(path, 'job.sh'), 'w') as f:
+                f.write(crest_job)
+        elif SERVERS['local']['cluster_soft'].lower() == 'pbs':
+            # Write job submission scripts
+            sub_job = submit_scripts['local']['crest']
+            format_params = {
+                "queue": "alon_q",
+                "name": f"crest_{xyz_crest_int}",
+                "cpus": SERVERS['local'].get('cpus', 8),
+                "memory": SERVERS['local'].get('memory', 32.0) * 1024,
+                "commands": command,
+            }
+            sub_job = sub_job.format(**format_params)
+
+            with open(os.path.join(path, settings['submit_filenames']['PBS']), 'w') as f:
+                f.write(sub_job)
+
+    return path
+
+def submit_crest_jobs(crest_paths: List[str]) -> None:
+    """
+    Submit CREST jobs to the server.
+
+    Args:
+        crest_paths (List[str]): List of paths to the CREST directories.
+
+    Returns:
+        dict: A dictionary containing job IDs as keys and their statuses as values.
+    """
+    crest_jobs = {}
+    for crest_path in crest_paths:
+        job_status, job_id = submit_job(path=crest_path)
+        crest_jobs[job_id] = {"path": crest_path, "status": job_status}
+    return crest_jobs
+
+
+def monitor_crest_jobs(crest_jobs: dict, check_interval: int = 300) -> None:
+    """
+    Monitor CREST jobs until they are complete.
+
+    Args:
+        crest_jobs (dict): Dictionary containing job information (job ID, path, and status).
+        check_interval (int): Time interval (in seconds) to wait between status checks.
+    """
+    while True:
+        all_done = True
+        for job_id, job_info in crest_jobs.items():
+            if job_info["status"] not in ["done", "failed"]:
+                try:
+                    job_info["status"] = check_job_status(job_id)  # Update job status
+                except Exception as e:
+                    logger.error(f"Error checking job status for job {job_id}: {e}")
+                    job_info["status"] = "failed"
+                if job_info["status"] not in ["done", "failed"]:
+                    all_done = False
+        if all_done:
+            break
+        time.sleep(min(check_interval, 100))
+
+def process_completed_jobs(crest_jobs: dict, xyz_guesses: list) -> None:
+    """
+    Process the completed CREST jobs and update XYZ guesses.
+
+    Args:
+        crest_jobs (dict): Dictionary containing job information.
+        xyz_guesses (list): List to store the resulting XYZ guesses.
+    """
+    for job_id, job_info in crest_jobs.items():
+        crest_path = job_info["path"]
+        if job_info["status"] == "done":
+            crest_best_path = os.path.join(crest_path, "crest_best.xyz")
+            if os.path.exists(crest_best_path):
+                with open(crest_best_path, "r") as f:
+                    content = f.read()
+                xyz_guess = str_to_xyz(content)
+                xyz_guesses.append(xyz_guess)
+            else:
+                logger.error(f"crest_best.xyz not found in {crest_path}")
+        elif job_info["status"] == "failed":
+            logger.error(f"CREST job failed for {crest_path}")
+
+def extract_digits(s: str) -> int:
+    """
+    Extract the first integer from a string
+
+    Args:
+        s (str): The string to extract the integer from
+
+    Returns:
+        int: The first integer in the string
+
+    """
+    return int(re.sub(r"[^\d]", "", s))
+
 
 def convert_xyz_to_df(xyz: dict) -> pd.DataFrame:
     """
@@ -1208,85 +1301,85 @@ def get_h_abs_atoms(dataframe: pd.DataFrame) -> dict:
 
     Returns:
         dict: The hydrogen atom and the two heavy atoms. The keys are 'H', 'A', 'B'
-
     """
-    # Ensure there are at least 3 atoms in the TS
-    if len(dataframe) < 3:
-        raise ValueError("TS must contain at least 3 atoms.")
-    if len(dataframe) == 3 and dataframe.index.str.startswith("H").sum() == 2:
-        # Identify the heavy atom
-        heavy_atom = dataframe.index[~dataframe.index.str.startswith("H")][0]  # Should be the only heavy atom
-        hydrogen_atoms = dataframe.index[dataframe.index.str.startswith("H")]  # List of hydrogen atoms
-
-        # Get distances from the heavy atom to the two hydrogens
-        distances_to_hydrogens = dataframe.loc[heavy_atom, hydrogen_atoms]
-
-        # Select the hydrogen with the smallest distance to the heavy atom as `H`
-        hydrogen_with_min_distance = distances_to_hydrogens.idxmin()
-
-        # The other hydrogen becomes `B`
-        other_hydrogen = hydrogen_atoms[hydrogen_atoms != hydrogen_with_min_distance][0]
-
-        return {"H": hydrogen_with_min_distance, "A": heavy_atom, "B": other_hydrogen}
-
-    elif len(dataframe) == 4 and dataframe.index.str.startswith("H").sum() == 3:
-        # Identify the heavy atom
-        heavy_atom = dataframe.index[~dataframe.index.str.startswith("H")][0]  # Should be the only heavy atom
-        hydrogen_atoms = dataframe.index[dataframe.index.str.startswith("H")]  # List of hydrogen atoms
-
-        # Remove hydrogens from columns and the heavy atom from rows
-        filtered_df = dataframe.loc[hydrogen_atoms, [heavy_atom]]
-
-        # Sort the distances from the heavy atom to all hydrogens
-        sorted_distances = filtered_df[heavy_atom].sort_values()
-
-        # Select the hydrogen with the second furthest distance
-        hydrogen_with_max_distance = sorted_distances.index[-2]
-
-        # Reset the DataFrame back to the original to find the other hydrogen (`B`)
-        remaining_hydrogens = hydrogen_atoms[hydrogen_atoms != hydrogen_with_max_distance]
-        filtered_hydrogens_df = dataframe.loc[[hydrogen_with_max_distance], remaining_hydrogens]
-
-        # Find the hydrogen closest to the selected hydrogen (`H`)
-        closest_hydrogen = filtered_hydrogens_df.idxmin(axis=1).iloc[0]
-
-        return {"H": hydrogen_with_max_distance, "A": heavy_atom, "B": closest_hydrogen}
 
+    closest_atoms = {}
+    for index, row in dataframe.iterrows():
+
+        row[index] = np.inf
+        closest = row.nsmallest(2).index.tolist()
+        closest_atoms[index] = closest
+
+    hydrogen_keys = [key for key in dataframe.index if key.startswith("H")]
+    condition_occurrences = []
+
+    for hydrogen_key in hydrogen_keys:
+        atom_neighbours = closest_atoms[hydrogen_key]
+        is_heavy_present = any(
+            atom for atom in closest_atoms if not atom.startswith("H")
+        )
+        if_hydrogen_present = any(
+            atom
+            for atom in closest_atoms
+            if atom.startswith("H") and atom != hydrogen_key
+        )
+
+        if is_heavy_present and if_hydrogen_present:
+            # Store the details of this occurrence
+            condition_occurrences.append(
+                {"H": hydrogen_key, "A": atom_neighbours[0], "B": atom_neighbours[1]}
+            )
+
+    # Check if the condition was met
+    if condition_occurrences:
+        if len(condition_occurrences) > 1:
+            # Store distances to decide which occurrence to use
+            occurrence_distances = []
+            for occurrence in condition_occurrences:
+                # Calculate the sum of distances to the two heavy atoms
+                hydrogen_key = f"{occurrence['H']}"
+                heavy_atoms = [f"{occurrence['A']}", f"{occurrence['B']}"]
+                try:
+                    distances = dataframe.loc[hydrogen_key, heavy_atoms].sum()
+                    occurrence_distances.append((occurrence, distances))
+                except KeyError as e:
+                    print(f"Error accessing distances for occurrence {occurrence}: {e}")
+
+            # Select the occurrence with the smallest distance
+            best_occurrence = min(occurrence_distances, key=lambda x: x[1])[0]
+            return {
+                "H": extract_digits(best_occurrence["H"]),
+                "A": extract_digits(best_occurrence["A"]),
+                "B": extract_digits(best_occurrence["B"]),
+            }
     else:
 
-        # Filter the DataFrame for hydrogen rows and non-hydrogen columns
-        hydrogen_rows = dataframe.index[dataframe.index.str.startswith("H")]
-        heavy_atom_columns = dataframe.columns[~dataframe.columns.str.startswith("H")]
-
-        filtered_df = dataframe.loc[hydrogen_rows, heavy_atom_columns]
+        # Check the all the hydrogen atoms, and see the closest two heavy atoms and aggregate their distances to determine which Hyodrogen atom has the lowest distance aggregate
+        min_distance = np.inf
+        selected_hydrogen = None
+        selected_heavy_atoms = None
 
-        # Find the hydrogen atom with the smallest bond distance to a heavy atom
-        min_distances = filtered_df.min(axis=1)
-        min_distances = min_distances[min_distances <= 2.0]
-        hydrogen_with_min_distance = min_distances.idxmax()
-        min_distance_column = filtered_df.loc[hydrogen_with_min_distance].idxmin()
+        for hydrogen_key in hydrogen_keys:
+            atom_neighbours = closest_atoms[hydrogen_key]
+            heavy_atoms = [atom for atom in atom_neighbours if not atom.startswith("H")]
 
-        # Handle cases with multiple heavy atoms
-        remaining_columns = dataframe.columns[
-            ~dataframe.columns.isin([hydrogen_with_min_distance, min_distance_column])
-        ]
-        remaining_df = dataframe.loc[[hydrogen_with_min_distance], remaining_columns]
-        second_closest_atom = remaining_df.idxmin(axis=1).iloc[0]
-
-    return {"H": hydrogen_with_min_distance, "A": min_distance_column, "B": second_closest_atom}
-
-def extract_digits(s: str) -> int:
-    """
-    Extract the first integer from a string
-
-    Args:
-        s (str): The string to extract the integer from
-
-    Returns:
-        int: The first integer in the string
+            if len(heavy_atoms) < 2:
+                continue
 
-    """
-    return int(re.sub(r'[^\d]', '', s))
+            distances = dataframe.loc[hydrogen_key, heavy_atoms[:2]].sum()
+            if distances < min_distance:
+                min_distance = distances
+                selected_hydrogen = hydrogen_key
+                selected_heavy_atoms = heavy_atoms
+
+        if selected_hydrogen:
+            return {
+                "H": extract_digits(selected_hydrogen),
+                "A": extract_digits(selected_heavy_atoms[0]),
+                "B": extract_digits(selected_heavy_atoms[1]),
+            }
+        else:
+            raise ValueError("No valid hydrogen atom found.")
 
 
 register_job_adapter('heuristics', HeuristicsAdapter)
diff --git a/arc/settings/submit.py b/arc/settings/submit.py
index 58bf1577a5..fb0647f808 100644
--- a/arc/settings/submit.py
+++ b/arc/settings/submit.py
@@ -289,6 +289,38 @@
     },
 
     'atlas': {
+        'crest': """Universe      = vanilla
+
++JobName      = "{name}"
+
+log           = job.log
+output        = out.txt
+error         = err.txt
+
+getenv        = True
+
+should_transfer_files = no
+
+executable = job.sh
+
+request_cpus  = {cpus}
+request_memory = {memory}MB
+
+queue
+
+""",
+        'crest_job': """#!/bin/bash
+
+touch initial_time
+
+{commands}
+
+touch final_time
+
+# Remove all files except for crest_best.xyz, coords.ref constraints.inp
+rm -vfr !(crest_best.xyz|coords.ref|constraints.inp)
+
+""",
         # Atlas uses HTCondor, see docs here: https://htcondor.readthedocs.io/en/latest/
         # Gaussian 09
         'gaussian': """Universe      = vanilla
@@ -854,6 +886,22 @@
 touch final_time
 
     """,
+        'crest': """#!/bin/bash -l
+#PBS -q {queue}
+#PBS -N {name}
+#PBS -l select=1:ncpus={cpus}:mem={memory}:mpiprocs={cpus}
+#PBS -o out.txt
+#PBS -e err.txt
+
+touch initial_time
+
+{commands}
+
+touch final_time
+
+rm -vrf !(crest_best.xyz|coords.ref|constraints.inp)
+
+        """,
     },
  'server3': {
         'gaussian': """#!/bin/bash -l
diff --git a/arc/species/converter.py b/arc/species/converter.py
index e43fcc4bf4..c7345bfda6 100644
--- a/arc/species/converter.py
+++ b/arc/species/converter.py
@@ -69,6 +69,8 @@ def str_to_str(xyz_str: str,
     Returns: str
         The converted string xyz format.
     """
+    if isinstance(xyz_str, list):
+        xyz_str = '\n'.join(xyz_str)
     if not isinstance(xyz_str, str):
         raise ConverterError(f'Expected a string input, got {type(xyz_str)}')
     if project_directory is not None and os.path.isfile(os.path.join(project_directory, xyz_str)):