From e46aa167114c8065499f0298691ce66d6b9ce8f1 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Tue, 31 Oct 2023 13:41:59 -0400 Subject: [PATCH 01/28] added options to generate qcschemas for inputs --- peslearn/datagen/configuration_space.py | 85 +++++++++++++++++++++++-- peslearn/input_processor.py | 16 +++-- setup.py | 2 +- 3 files changed, 94 insertions(+), 9 deletions(-) diff --git a/peslearn/datagen/configuration_space.py b/peslearn/datagen/configuration_space.py index 698a136..59a5b07 100644 --- a/peslearn/datagen/configuration_space.py +++ b/peslearn/datagen/configuration_space.py @@ -235,9 +235,9 @@ def add_redundancies_back(self): # add duplicate to duplicate_interatomics column if it has not been found if idm not in row[-1]: row[-1].append(idm) + - - def generate_PES(self, template_obj): + def generate_templates(self, template_obj): # generate the full geometry set or the removed redundancy geometry set? self.generate_geometries() if self.input_obj.keywords['remove_redundancy'].lower().strip() == 'true': @@ -251,7 +251,7 @@ def generate_PES(self, template_obj): df = self.unique_geometries elif self.input_obj.keywords['remove_redundancy'].lower().strip() == 'false': df = self.all_geometries - + pes_dir_name = self.input_obj.keywords['pes_dir_name'] if not os.path.exists("./" + pes_dir_name): os.mkdir("./" + pes_dir_name) @@ -291,7 +291,84 @@ def generate_PES(self, template_obj): os.chdir("../") print("Your PES inputs are now generated. 
Run the jobs in the {} directory and then parse.".format(pes_dir_name)) - + + def generate_schema(self): + + self.generate_geometries() + if self.input_obj.keywords['remove_redundancy'].lower().strip() == 'true': + print("Removing symmetry-redundant geometries...", end=' ') + self.remove_redundancies() + + if self.input_obj.keywords['grid_reduction']: + self.filter_configurations() + if self.input_obj.keywords['remember_redundancy'].lower().strip() == 'true': + self.add_redundancies_back() + df = self.unique_geometries + elif self.input_obj.keywords['remove_redundancy'].lower().strip() == 'false': + df = self.all_geometries + + pes_dir_name = self.input_obj.keywords['pes_dir_name'] + if not os.path.exists("./" + pes_dir_name): + os.mkdir("./" + pes_dir_name) + os.chdir("./" + pes_dir_name) + + for i, cart_array in enumerate(df['cartesians'], start=1): + schema_inp = {} + symbols = [] + geom = [] + xyz = '' + if self.input_obj.keywords['schema_method'] == None: + raise Exception("'schema_method' cannot be blank, please enter a method.") + if self.input_obj.keywords['schema_basis'] == None: + raise Exception("'schema_basis' cannot be blank, please enter a basis.") + method = self.input_obj.keywords['schema_method'] + basis = self.input_obj.keywords['schema_basis'] + + for j in range(len(self.mol.std_order_atoms)): + symbols += self.mol.std_order_atom_labels[j] + geom += [cart_array[j][0], cart_array[j][1], cart_array[j][2]] + schema_inp['symbols'] = symbols + schema_inp['geometry'] = geom + + if self.input_obj.keywords['schema_keywords'] != None: + keywords = self.input_obj.keywords['schema_keywords'] + + driver = self.input_obj.keywords['schema_driver'] + if driver not in ['energy','hessian','gradient','properties']: + raise Exception("{} is not a valid option for 'schema_driver', entry must be 'energy', 'hessian', 'gradient', 'properties'".format(driver)) + + prog = self.input_obj.keywords['schema_prog'] + if prog == None: + raise Exception("'schema_prog' must be 
defined, please enter a program.") + + if not os.path.exists(str(i)): + os.mkdir(str(i)) + os.chdir(str(i)) + + with open('input.dat', 'w') as f: + f.write("import qcengine as qcng\nimport qcelemental as qcel\n\n") + f.write('molecule = qcel.models.Molecule.from_data("""\n') + for j in range(len(self.mol.std_order_atoms)): + xyz += "%s %10.10f %10.10f %10.10f\n" % (self.mol.std_order_atom_labels[j], cart_array[j][0], cart_array[j][1], cart_array[j][2]) + f.write(xyz) + f.write('""")\n\n') + f.write("driver = '%s'\nmodel = {'method':'%s', 'basis':'%s'}\nkeywords = %s\nprog = '%s'\n\n" % (driver, method, basis, keywords, prog)) + f.write("atomic_inp = qcel.models.AtomicInput(molecule=molecule, driver=driver, model=model, keywords=keywords)\n\n") + f.write("atomic_res = qcng.compute(atomic_inp, prog)\n\n") + f.write("with open('output.dat','w') as f:\n\tf.write('{\\n')\n\tfor key, value in atomic_res.dict():\n\t\tf.write('{%s:%s}\\n', (key, value))\n\tf.write('}')") + + os.chdir("../") + + print("Your PES inputs are now generated. 
Run the jobs in the {} directory and then parse.".format(pes_dir_name)) + + def generate_PES(self, template_obj=None, schema_gen='false'): + if self.input_obj.keywords['schema_generate'].lower().strip() == 'true' or schema_gen == 'true': + self.generate_schema() + elif template_obj == None and self.input_obj.keywords['schema_generate'].lower().strip() == 'false' and schema_gen == 'false': + raise Exception("template_obj not found, check your path.") + else: + self.generate_templates(template_obj) + def old_remove_redundancies(self): """ diff --git a/peslearn/input_processor.py b/peslearn/input_processor.py index 53ecb42..667a050 100644 --- a/peslearn/input_processor.py +++ b/peslearn/input_processor.py @@ -59,14 +59,22 @@ def get_keywords(self): 'sort_pes': 'true', #'false' 'sampling': 'structure_based', # 'structure_based','sobol', 'smart_random', 'random', 'energy_ordered' 'n_low_energy_train': 0, # any int - 'training_points': None, # any int - 'validation_points': None, # any int + 'training_points': None, # any int + 'validation_points': None, # any int 'hp_maxit': 20, # any int 'rseed': None, # any int 'gp_ard': 'true', # 'true', 'false'. 'opt' treats as hyperparameter - 'nas_trial_layers': None, # List of lists e.g. [[10,], [10,10,10], [50,50]] + 'nas_trial_layers': None, # List of lists e.g. 
[[10,], [10,10,10], [50,50]] 'nn_precision': 32, # neural network floating point precision 32 or 64 - 'hp_opt': 'true'} # 'false' + 'hp_opt': 'true', # 'false' + 'schema_generate':'false', # 'true' + 'units':'bohr', # 'angstrom' + 'schema_method' : None, + 'schema_basis' : None, + 'schema_driver' : 'energy', + 'schema_keywords' : None, + 'schema_prog' : None + } for k in string_keywords: match = re.search(k+"\s*=\s*(.+)", self.full_string) diff --git a/setup.py b/setup.py index 4828bd7..18880f0 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ license='BSD-3C', packages=setuptools.find_packages(), install_requires=[ - 'numpy>=1.7','GPy>=1.9','scikit-learn>=0.20','pandas>=0.24','hyperopt>=0.1.1','cclib>=1.6', 'torch>=1.0.1' + 'numpy>=1.7','GPy>=1.9','scikit-learn>=0.20','pandas>=0.24','hyperopt>=0.1.1','cclib>=1.6', 'torch>=1.0.1', 'qcelemental' ], extras_require={ 'docs': [ From e972ca465e6995d23f5dfbbde101d8f78a5ed187 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Tue, 31 Oct 2023 14:14:36 -0400 Subject: [PATCH 02/28] cleaned code, added comments --- peslearn/datagen/configuration_space.py | 32 ++++++++++--------------- peslearn/input_processor.py | 10 ++++---- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/peslearn/datagen/configuration_space.py b/peslearn/datagen/configuration_space.py index 59a5b07..86e7948 100644 --- a/peslearn/datagen/configuration_space.py +++ b/peslearn/datagen/configuration_space.py @@ -313,30 +313,21 @@ def generate_schema(self): os.chdir("./" + pes_dir_name) for i, cart_array in enumerate(df['cartesians'], start=1): - schema_inp = {} - symbols = [] - geom = [] xyz = '' - if self.input_obj.keywords['schema_method'] == None: - raise Exception("'schema_method' cannot be blank, please enter a method.") - if self.input_obj.keywords['schema_basis'] == None: - raise Exception("'schema_basis' cannot be blank, please enter a basis.") + # check the contents of the input string for keywords necessary for schema generation + 
driver = self.input_obj.keywords['schema_driver'] + if driver not in ['energy','hessian','gradient','properties']: + raise Exception("{} is not a valid option for 'schema_driver', entry must be 'energy', 'hessian', 'gradient', 'properties'".format(driver)) method = self.input_obj.keywords['schema_method'] + if method == None: + raise Exception("'schema_method' cannot be blank, please enter a method.") basis = self.input_obj.keywords['schema_basis'] - - for j in range(len(self.mol.std_order_atoms)): - symbols += self.mol.std_order_atom_labels[j] - geom += [cart_array[j][0], cart_array[j][1], cart_array[j][2]] - schema_inp['symbols'] = symbols - schema_inp['geometry'] = geom - - if self.input_obj.keywords['schema_keywords'] != None: + if basis == None: + raise Exception("'schema_basis' cannot be blank, please enter a basis.") + if self.input_obj.keywords['schema_keywords'] == None: + keywords = '{}' + else: keywords = self.input_obj.keywords['schema_keywords'] - - driver = self.input_obj.keywords['schema_driver'] - if driver not in ['energy','hessian','gradient','properties']: - raise Exception("{} is not a valid option for 'schema_driver', entry must be 'energy', 'hessian', 'gradient', 'properties'".format(driver)) - prog = self.input_obj.keywords['schema_prog'] if prog == None: raise Exception("'schema_prog' must be defined, please enter a program.") @@ -345,6 +336,7 @@ def generate_schema(self): os.mkdir(str(i)) os.chdir(str(i)) + # write the input files to run with qcengine with open('input.dat', 'w') as f: f.write("import qcengine as qcng\nimport qcelemental as qcel\n\n") f.write('molecule = qcel.models.Molecule.from_data("""\n') diff --git a/peslearn/input_processor.py b/peslearn/input_processor.py index 667a050..358dc4b 100644 --- a/peslearn/input_processor.py +++ b/peslearn/input_processor.py @@ -69,11 +69,11 @@ def get_keywords(self): 'hp_opt': 'true', # 'false' 'schema_generate':'false', # 'true' 'units':'bohr', # 'angstrom' - 'schema_method' : None, - 
'schema_basis' : None, - 'schema_driver' : 'energy', - 'schema_keywords' : None, - 'schema_prog' : None + 'schema_method' : None, # any method interpretable by QC software of choice + 'schema_basis' : None, # any basis interperetable by QC software of choice + 'schema_driver' : 'energy', # 'hessian', 'gradient', 'properties' + 'schema_keywords' : None, # any keywords interperetable by QC software of choice, python dictionary in quotes + 'schema_prog' : None # any program supported by QCEngine } for k in string_keywords: From 048de439dfe3936f9e33ff5f544b194e6f24f50a Mon Sep 17 00:00:00 2001 From: iantbeck Date: Tue, 31 Oct 2023 14:42:20 -0400 Subject: [PATCH 03/28] added dependencies --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 18880f0..0210023 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ license='BSD-3C', packages=setuptools.find_packages(), install_requires=[ - 'numpy>=1.7','GPy>=1.9','scikit-learn>=0.20','pandas>=0.24','hyperopt>=0.1.1','cclib>=1.6', 'torch>=1.0.1', 'qcelemental' + 'numpy>=1.7','GPy>=1.9','scikit-learn>=0.20','pandas>=0.24','hyperopt>=0.1.1','cclib>=1.6', 'torch>=1.0.1', 'qcelemental>=0.27.1', 'qcengine>=0.28.0' ], extras_require={ 'docs': [ From fedc4c13ff579f64435e2f55e6f028ed6b1865de Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 1 Nov 2023 11:08:32 -0400 Subject: [PATCH 04/28] fixed dependencies --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0210023..bf450d0 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ license='BSD-3C', packages=setuptools.find_packages(), install_requires=[ - 'numpy>=1.7','GPy>=1.9','scikit-learn>=0.20','pandas>=0.24','hyperopt>=0.1.1','cclib>=1.6', 'torch>=1.0.1', 'qcelemental>=0.27.1', 'qcengine>=0.28.0' + 'numpy>=1.7','GPy>=1.9','scikit-learn>=0.20','pandas>=0.24','hyperopt>=0.1.1','cclib>=1.6', 'torch>=1.0.1', 'qcelemental>=0.27.1', 'qcengine>=0.26.0' ], extras_require={ 'docs': [ 
From befd508e191b4d8ba87ca7ab4c88d7e4dcb2edb3 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 1 Nov 2023 11:30:12 -0400 Subject: [PATCH 05/28] fixed input generator for schemas --- peslearn/datagen/configuration_space.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/peslearn/datagen/configuration_space.py b/peslearn/datagen/configuration_space.py index 86e7948..019eebe 100644 --- a/peslearn/datagen/configuration_space.py +++ b/peslearn/datagen/configuration_space.py @@ -337,8 +337,8 @@ def generate_schema(self): os.chdir(str(i)) # write the input files to run with qcengine - with open('input.dat', 'w') as f: - f.write("import qcengine as qcng\nimport qcelemental as qcel\n\n") + with open('input.py', 'w') as f: + f.write("import qcengine as qcng\nimport qcelemental as qcel\nimport pprint\n\n") f.write('molecule = qcel.models.Molecule.from_data("""\n') for j in range(len(self.mol.std_order_atoms)): xyz += "%s %10.10f %10.10f %10.10f\n" % (self.mol.std_order_atom_labels[j], cart_array[j][0], cart_array[j][1], cart_array[j][2]) @@ -347,8 +347,7 @@ def generate_schema(self): f.write("driver = '%s'\nmodel = {'method':'%s', 'basis':'%s'}\nkeywords = %s\nprog = '%s'\n\n" % (driver, method, basis, keywords, prog)) f.write("atomic_inp = qcel.models.AtomicInput(molecule=molecule, driver=driver, model=model, keywords=keywords)\n\n") f.write("atomic_res = qcng.compute(atomic_inp, prog)\n\n") - f.write("with open('output.dat','w') as f:\n\tf.write('{\\n')\n\tfor key, value in atomic_res.dict():\n\t\tf.write('{%s:%s}\\n', (key, value))\n\tf.write('}')") - + f.write("with open('output.dat','w') as f:\n\tpprint.pprint(atomic_res.dict(), f)") os.chdir("../") print("Your PES inputs are now generated. 
Run the jobs in the {} directory and then parse.".format(pes_dir_name)) From 9d6dfc95b80c8c10a70ff6ba38124c2fd6f4a118 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 1 Nov 2023 12:34:25 -0400 Subject: [PATCH 06/28] fixed bohr/angstrom disagreement --- peslearn/datagen/configuration_space.py | 14 +++++++++++--- peslearn/input_processor.py | 4 ++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/peslearn/datagen/configuration_space.py b/peslearn/datagen/configuration_space.py index 019eebe..0ace3b3 100644 --- a/peslearn/datagen/configuration_space.py +++ b/peslearn/datagen/configuration_space.py @@ -331,7 +331,10 @@ def generate_schema(self): prog = self.input_obj.keywords['schema_prog'] if prog == None: raise Exception("'schema_prog' must be defined, please enter a program.") - + units = self.input_obj.keywords['schema_units'] + if units == 'bohr': + from .. import constants + if not os.path.exists(str(i)): os.mkdir(str(i)) os.chdir(str(i)) @@ -341,9 +344,14 @@ def generate_schema(self): f.write("import qcengine as qcng\nimport qcelemental as qcel\nimport pprint\n\n") f.write('molecule = qcel.models.Molecule.from_data("""\n') for j in range(len(self.mol.std_order_atoms)): - xyz += "%s %10.10f %10.10f %10.10f\n" % (self.mol.std_order_atom_labels[j], cart_array[j][0], cart_array[j][1], cart_array[j][2]) + if units == 'bohr': + xyz += "%s %10.10f %10.10f %10.10f\n" % (self.mol.std_order_atom_labels[j], cart_array[j][0] * constants.bohr2angstroms, cart_array[j][1] * constants.bohr2angstroms, cart_array[j][2] * constants.bohr2angstroms) + elif units == 'angstrom': + xyz += "%s %10.10f %10.10f %10.10f\n" % (self.mol.std_order_atom_labels[j], cart_array[j][0], cart_array[j][1], cart_array[j][2]) f.write(xyz) - f.write('""")\n\n') + f.write('""",\nfix_com=True,\nfix_orientation=True)\n') + if units == 'bohr': + f.write('# The above geometry is in Angstroms for QCEngine input purposes.\n\n') f.write("driver = '%s'\nmodel = {'method':'%s', 
'basis':'%s'}\nkeywords = %s\nprog = '%s'\n\n" % (driver, method, basis, keywords, prog)) f.write("atomic_inp = qcel.models.AtomicInput(molecule=molecule, driver=driver, model=model, keywords=keywords)\n\n") f.write("atomic_res = qcng.compute(atomic_inp, prog)\n\n") diff --git a/peslearn/input_processor.py b/peslearn/input_processor.py index 358dc4b..3e5f113 100644 --- a/peslearn/input_processor.py +++ b/peslearn/input_processor.py @@ -67,8 +67,8 @@ def get_keywords(self): 'nas_trial_layers': None, # List of lists e.g. [[10,], [10,10,10], [50,50]] 'nn_precision': 32, # neural network floating point precision 32 or 64 'hp_opt': 'true', # 'false' - 'schema_generate':'false', # 'true' - 'units':'bohr', # 'angstrom' + 'schema_generate' : 'false', # 'true' + 'schema_units' : 'bohr', # 'angstrom' 'schema_method' : None, # any method interpretable by QC software of choice 'schema_basis' : None, # any basis interperetable by QC software of choice 'schema_driver' : 'energy', # 'hessian', 'gradient', 'properties' From cee4c34ae1c91bcac379e1b610165bc29bcdf43f Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 1 Nov 2023 14:10:21 -0400 Subject: [PATCH 07/28] added schema keywords --- 3_Keywords/keywords.md | 44 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/3_Keywords/keywords.md b/3_Keywords/keywords.md index 759c443..e878d20 100644 --- a/3_Keywords/keywords.md +++ b/3_Keywords/keywords.md @@ -132,12 +132,54 @@ If this keyword is not used, the software will ask what you want to do. * `sort_pes` - **Description:** When parsing to produce a dataset, sort the energies in increasing order + **Description:** When parsing to produce a dataset, sort the energies in increasing order. * **Type**: bool * **Default**: true * **Possible values**: true, false +* `schema_generate` + **Description:** Generate input files that will run with QCEngine to produce QCSchema outputs. 
+ * **Type**: bool + * **Default**: false + * **Possible values**: true, false + +* `schema_units` + **Description:** The units of the provided Z-Matrix input. QCEngine expects input units of Angstroms so Bohr will be converted. + * **Type**: string + * **Default**: angstrom + * **Possible values**: bohr, angstrom + +* `schema_method` + **Description:** Any method that can be interpreted by the quantum chemical software of choice. + * **Type**: string + * **Default**: None + * **Possible values**: any string, e.g. 'hf', 'ccsd', etc. + +* `schema_basis` + **Description:** Any basis that can be interpreted by the quantum chemical software of choice. + * **Type**: string + * **Default**: None + * **Possible values**: any string, e.g. 'sto-3g', 'cc-pvdz', etc. + +* `schema_driver` + **Description:** The type of computation for QCEngine to run. + * **Type**: string + * **Default**: 'energy' + * **Possible values**: 'energy', 'hessian', 'gradient', 'properties' + +* `schema_keywords` + **Description:** A python dictionary surrounded by quotes containing keywords to be used by the quantum chemical software of choice. + * **Type**: dict, surrounded by quotes + * **Default**: None + * **Possible values**: any dict surrounded by quotes e.g. "{'e_convergence': '1e-4', 'maxiter': '30'}" + +* `schema_prog` + **Description:** The quantum chemical program to run the desired computation, must be a program supported by QCEngine. + * **Type**: string + * **Default**: None + * **Possible values**: any string e.g.
'psi4' + ## Machine Learning Keywords * `ml_model` From 74929dd33ac07224a5489743f0c0612fda57db6e Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 1 Nov 2023 14:10:57 -0400 Subject: [PATCH 08/28] fixed typos --- peslearn/input_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/peslearn/input_processor.py b/peslearn/input_processor.py index 3e5f113..38b21bd 100644 --- a/peslearn/input_processor.py +++ b/peslearn/input_processor.py @@ -68,7 +68,7 @@ def get_keywords(self): 'nn_precision': 32, # neural network floating point precision 32 or 64 'hp_opt': 'true', # 'false' 'schema_generate' : 'false', # 'true' - 'schema_units' : 'bohr', # 'angstrom' + 'schema_units' : 'angstrom', # 'bohr' 'schema_method' : None, # any method interpretable by QC software of choice 'schema_basis' : None, # any basis interperetable by QC software of choice 'schema_driver' : 'energy', # 'hessian', 'gradient', 'properties' From 9010e8925a46d6b4518faa2d46c82d9936e54e4b Mon Sep 17 00:00:00 2001 From: iantbeck Date: Thu, 2 Nov 2023 11:43:37 -0400 Subject: [PATCH 09/28] updated for use with pandas 2.0 --- peslearn/utils/parsing_helper.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/peslearn/utils/parsing_helper.py b/peslearn/utils/parsing_helper.py index 6542e0c..0d5c6cb 100644 --- a/peslearn/utils/parsing_helper.py +++ b/peslearn/utils/parsing_helper.py @@ -23,6 +23,11 @@ def extract_energy(input_obj, output_obj): return energy else: raise Exception("\n energy_regex value not assigned in input. Please add a regular expression which captures the energy value, e.g. 
energy_regex = 'RHF Final Energy: \s+(-\d+\.\d+)'") + + elif input_obj.keywords['energy'] == 'schema': + def extract_energy(input_obj, output_obj): + energy = output_obj.extract_energy_from_schema() + return energy # define gradient extraction routine based on user keywords if input_obj.keywords['gradient'] == 'cclib': @@ -44,6 +49,14 @@ def extract_gradient(output_obj, h=header, f=footer, g=grad_line_regex): else: raise Exception("For regular expression gradient extraction, gradient_header, gradient_footer, and gradient_line string identifiers are required to isolate the cartesian gradient block. See documentation for details") + elif input_obj.keywords['gradient'] == 'schema': + # add function to find gradient from schema + return None + + #add function to parse hessian from schema + + #add function to parse properties from schema + # parse original internals or interatomics? if input_obj.keywords['pes_format'] == 'zmat': data = pd.DataFrame(index=None, columns = mol.unique_geom_parameters) @@ -84,7 +97,7 @@ def extract_gradient(output_obj, h=header, f=footer, g=grad_line_regex): if input_obj.keywords['gradient']: df2 = pd.DataFrame(data=[G.flatten().tolist()],index=None, columns=grad_cols) df = pd.concat([df, df2], axis=1) - data = data.append(df) + data = pd.concat([data, df]) if input_obj.keywords['pes_redundancy'] == 'true': continue else: From a382882df727eeff97832cc53fa58b0adccf861f Mon Sep 17 00:00:00 2001 From: iantbeck Date: Thu, 2 Nov 2023 11:45:05 -0400 Subject: [PATCH 10/28] geom and inter. 
files now created w/ schema gen --- peslearn/datagen/configuration_space.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/peslearn/datagen/configuration_space.py b/peslearn/datagen/configuration_space.py index 0ace3b3..8b355a0 100644 --- a/peslearn/datagen/configuration_space.py +++ b/peslearn/datagen/configuration_space.py @@ -337,6 +337,26 @@ def generate_schema(self): if not os.path.exists(str(i)): os.mkdir(str(i)) + + # tag with internal coordinates, include duplicates if requested + with open("{}/geom".format(str(i)), 'w') as f: + tmp_dict = OrderedDict(zip(self.mol.geom_parameters, list(df.iloc[i-1]['internals']))) + f.write(json.dumps([tmp_dict])) + #f.write(json.dumps([df.iloc[i-1]['internals']])) + if 'duplicate_internals' in df: + for j in range(len(df.iloc[i-1]['duplicate_internals'])): + f.write("\n") + tmp_dict = OrderedDict(zip(self.mol.geom_parameters, df.iloc[i-1]['duplicate_internals'][j])) + f.write(json.dumps([tmp_dict])) + #f.write(json.dumps([df.iloc[i-1]['duplicate_internals'][j]])) + # tag with interatomic distance coordinates, include duplicates if requested + with open("{}/interatomics".format(str(i)), 'w') as f: + f.write(json.dumps([OrderedDict(df.iloc[i-1][self.bond_columns])])) + if 'duplicate_interatomics' in df: + for j in range(len(df.iloc[i-1]['duplicate_interatomics'])): + f.write("\n") + f.write(json.dumps([df.iloc[i-1]['duplicate_interatomics'][j]])) + os.chdir(str(i)) # write the input files to run with qcengine @@ -355,7 +375,7 @@ def generate_schema(self): f.write("driver = '%s'\nmodel = {'method':'%s', 'basis':'%s'}\nkeywords = %s\nprog = '%s'\n\n" % (driver, method, basis, keywords, prog)) f.write("atomic_inp = qcel.models.AtomicInput(molecule=molecule, driver=driver, model=model, keywords=keywords)\n\n") f.write("atomic_res = qcng.compute(atomic_inp, prog)\n\n") - f.write("with open('output.dat','w') as f:\n\tpprint.pprint(atomic_res.dict(), f)") + f.write("with open('%s','w') as 
f:\n\tpprint.pprint(atomic_res.dict(), f)" % (self.input_obj.keywords['output_name'])) os.chdir("../") print("Your PES inputs are now generated. Run the jobs in the {} directory and then parse.".format(pes_dir_name)) From f3dffc43b564ae1b3ffe85d363efcc2309ab82a8 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Tue, 12 Dec 2023 11:23:16 -0500 Subject: [PATCH 11/28] finished support for schema energy parsing --- peslearn/datagen/outputfile.py | 36 ++++++++++++++++++++++++++++++++ peslearn/input_processor.py | 4 ++-- peslearn/utils/parsing_helper.py | 2 +- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/peslearn/datagen/outputfile.py b/peslearn/datagen/outputfile.py index 8de513b..a3cfd26 100644 --- a/peslearn/datagen/outputfile.py +++ b/peslearn/datagen/outputfile.py @@ -105,6 +105,42 @@ def extract_energy_with_cclib(self, cclib_attribute, energy_index=-1): e /= constants.hartree2ev return e + def extract_from_schema(self, driver): + """ + Attempts to extract result from standard QCSchema output + + Parameters + --------- + None + + Returns + --------- + energy : float + The energy result from the 'return_result' item in the standard QCSchema output + gradient : np.array + A numpy array of floats representing the cartesian gradient from the 'return_result' item in the standard QCSchema output + """ + if driver == "energy": + energy = None + energy = re.findall("\s\'return_result\'\:\s+(-\d+\.\d+)", self.output_str) + #add some statment for a failed computation + return energy + + if driver == "hessian": + hessian = None + + return hessian + + if driver == "gradient": + gradient = None + + return gradient + + if driver == "properties": + properties = None + + return properties + def extract_cartesian_gradient_with_regex(self, header, footer, grad_line_regex): """ Extracts cartesian gradients according to user supplied regular expressions. 
diff --git a/peslearn/input_processor.py b/peslearn/input_processor.py index 38b21bd..6384d0c 100644 --- a/peslearn/input_processor.py +++ b/peslearn/input_processor.py @@ -35,10 +35,10 @@ def get_keywords(self): # keywords which have values that are strings, not other datatypes regex_keywords = {'energy_regex': None, 'gradient_header': None, 'gradient_footer': None, 'gradient_line': None, 'input_name': 'input.dat', 'output_name': 'output.dat', 'pes_dir_name': 'PES_data', 'pes_name': 'PES.dat'} - string_keywords = {'energy': None, # parse energies with 'cclib', 'regex' + string_keywords = {'energy': None, # parse energies with 'cclib', 'regex', or 'schema' 'energy_regex': None, # a regular expression string, surround by '' or "" 'energy_cclib': None, # a cclib energy option. 'scfenergies', 'mpenergies', 'ccenergies' - 'gradient': None, # parse gradients with 'cclib', 'regex' + 'gradient': None, # parse gradients with 'cclib', 'regex', or 'schema' 'gradient_header': None, # gradient header regular expression string 'gradient_footer': None, # gradient footer regular expression string 'gradient_line': None, # regular expression string for one line of the cartesian gradient diff --git a/peslearn/utils/parsing_helper.py b/peslearn/utils/parsing_helper.py index 0d5c6cb..8b0d0a0 100644 --- a/peslearn/utils/parsing_helper.py +++ b/peslearn/utils/parsing_helper.py @@ -26,7 +26,7 @@ def extract_energy(input_obj, output_obj): elif input_obj.keywords['energy'] == 'schema': def extract_energy(input_obj, output_obj): - energy = output_obj.extract_energy_from_schema() + energy = output_obj.extract_from_schema(input_obj.keywords['schema_driver']) return energy # define gradient extraction routine based on user keywords From a8f05356c7405625d6aaea0333bcd933fc3c702f Mon Sep 17 00:00:00 2001 From: iantbeck Date: Tue, 2 Jan 2024 14:02:01 -0500 Subject: [PATCH 12/28] added support to parse multiple data from schema --- peslearn/datagen/outputfile.py | 45 ++++++++++++++------ 
peslearn/utils/parsing_helper.py | 73 +++++++++++++++++++++++++++----- 2 files changed, 96 insertions(+), 22 deletions(-) diff --git a/peslearn/datagen/outputfile.py b/peslearn/datagen/outputfile.py index a3cfd26..f0d51fd 100644 --- a/peslearn/datagen/outputfile.py +++ b/peslearn/datagen/outputfile.py @@ -111,30 +111,51 @@ def extract_from_schema(self, driver): Parameters --------- - None + driver : str + The 'highest-order' result to parse from QCSchema output: 'energy', 'gradient', 'hessian' or 'properties' Returns --------- - energy : float + energy : str The energy result from the 'return_result' item in the standard QCSchema output gradient : np.array A numpy array of floats representing the cartesian gradient from the 'return_result' item in the standard QCSchema output + hessian : np.array + """ if driver == "energy": - energy = None - energy = re.findall("\s\'return_result\'\:\s+(-\d+\.\d+)", self.output_str) + energy = re.findall("\s\'return_energy\'\:\s+(-\d+\.\d+)", self.output_str) #add some statment for a failed computation return energy - if driver == "hessian": - hessian = None - - return hessian - if driver == "gradient": - gradient = None + gradient = re.findall("\s\'return_gradient\'\:\s+array\(([\s\S]*?)\)\,", self.output_str) + if gradient: + import ast + gradient = re.sub(r'\s+', "", str(gradient)) + gradient = re.sub(r'\\n','',gradient) + gradient = re.sub(r'\[\'','',gradient) + gradient = re.sub(r'\'\]','',gradient) + gradient = np.asarray(ast.literal_eval(gradient)).astype(np.float64) + return gradient + else: + return None + #add failed computaiton - return gradient + if driver == "hessian": + hessian = re.findall("\s\'return_hessian\'\:\s+array\(([\s\S]*?)\)\,", self.output_str) + if hessian: + import ast + hessian = re.sub(r'\s+', "", str(hessian)) + hessian = re.sub(r'\\n','',hessian) + hessian = re.sub(r'\[\'','',hessian) + hessian = re.sub(r'\'\]','',hessian) + hessian = np.asarray(ast.literal_eval(hessian)).astype(np.float64) + return 
hessian + else: + return None + + #add failed computation error if driver == "properties": properties = None @@ -188,7 +209,7 @@ def extract_cartesian_gradient_with_regex(self, header, footer, grad_line_regex) #TODO add catch for when only some lines of the gradient are parsed but not all, check against number of atoms or something if gradient: # this gradient is a list of tuples, each tuple is an x, y, z for one atom - gradient = np.asarray(gradient).astype(np.float) + gradient = np.asarray(gradient).astype(np.float64) return gradient else: return None diff --git a/peslearn/utils/parsing_helper.py b/peslearn/utils/parsing_helper.py index 8b0d0a0..58031a9 100644 --- a/peslearn/utils/parsing_helper.py +++ b/peslearn/utils/parsing_helper.py @@ -24,10 +24,39 @@ def extract_energy(input_obj, output_obj): else: raise Exception("\n energy_regex value not assigned in input. Please add a regular expression which captures the energy value, e.g. energy_regex = 'RHF Final Energy: \s+(-\d+\.\d+)'") - elif input_obj.keywords['energy'] == 'schema': + + if input_obj.keywords['energy'] == 'schema': def extract_energy(input_obj, output_obj): - energy = output_obj.extract_from_schema(input_obj.keywords['schema_driver']) + energy = output_obj.extract_from_schema(driver='energy') return energy + + # define extractions from schema based on user keywords + # if input_obj.keywords['energy'] == 'schema' or input_obj.keywords['gradient'] == 'schema' or input_obj.keywords['hessian'] == 'schema': + # def extract_energy(input_obj, output_obj): + # if input_obj.keywords['schema_driver'] == 'energy': + # energy = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) + # return energy + # elif input_obj.keywords['schema_driver'] == 'gradient': + # if input_obj.keywords['energy'] == 'schema': + # energy, gradient = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) + # return energy, gradient + # else: + # gradient = 
output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) + # return gradient + # elif input_obj.keywords['schema_driver'] == 'hessian': + # if input_obj.keywords['energy'] == 'schema': + # if input_obj.keywords['gradient'] == 'schema': + # energy, gradient, hessian = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) + # return energy, gradient, hessian + # elif not input_obj.keywords['gradient'] == 'schema': + # energy, hessian = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) + # return energy, hessian + # elif input_obj.keywords['gradient'] == 'schema' and not input_obj.keywords['energy'] == 'schema': + # gradient, hessian = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) + # return gradient, hessian + # elif not input_obj.keywords['energy'] == 'schema' and not input_obj.keywords['gradient'] == 'schema': + # hessian = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) + # return hessian # define gradient extraction routine based on user keywords if input_obj.keywords['gradient'] == 'cclib': @@ -50,10 +79,14 @@ def extract_gradient(output_obj, h=header, f=footer, g=grad_line_regex): raise Exception("For regular expression gradient extraction, gradient_header, gradient_footer, and gradient_line string identifiers are required to isolate the cartesian gradient block. 
See documentation for details") elif input_obj.keywords['gradient'] == 'schema': - # add function to find gradient from schema - return None + def extract_gradient(output_obj, input_obj): + gradient = output_obj.extract_from_schema(driver='gradient') + return gradient - #add function to parse hessian from schema + if input_obj.keywords['hessian'] == 'schema': + def extract_hessian(input_obj, output_obj): + hessian = output_obj.extract_from_schema(driver='hessian') + return hessian #add function to parse properties from schema @@ -74,29 +107,43 @@ def extract_gradient(output_obj, h=header, f=footer, g=grad_line_regex): grad_cols = ["g%d" % (i) for i in range(ngrad)] for i in grad_cols: data[i] = '' + if input_obj.keywords['hessian']: + nhess = (3*(mol.n_atoms - mol.n_dummy))*(3*(mol.n_atoms - mol.n_dummy)) + hess_cols = ["h%d" % (i) for i in range(nhess)] + for i in hess_cols: + data[i] = '' + # parse output files os.chdir("./" + input_obj.keywords['pes_dir_name']) dirs = [i for i in os.listdir(".") if os.path.isdir(i) ] dirs = sorted(dirs, key=lambda x: int(x)) - for d in dirs: - #path = d + "/" + "output.dat" + for d in dirs: path = d + "/" + input_obj.keywords['output_name'] output_obj = OutputFile(path) if input_obj.keywords['energy']: E = extract_energy(input_obj, output_obj) if input_obj.keywords['gradient']: - G = extract_gradient(output_obj) + G = extract_gradient(output_obj, input_obj) ngrad = 3*(mol.n_atoms - mol.n_dummy) grad_cols = ["g%d" % (i) for i in range(ngrad)] + if input_obj.keywords['hessian']: + H = extract_hessian(input_obj, output_obj) + nhess = (3*(mol.n_atoms - mol.n_dummy))*(3*(mol.n_atoms - mol.n_dummy)) + hess_cols = ["h%d" % (i) for i in range(nhess)] + with open(d + geom_path) as f: for line in f: tmp = json.loads(line, object_pairs_hook=OrderedDict) df = pd.DataFrame(data=tmp, index=None, columns=tmp[0].keys()) - df['E'] = E + if input_obj.keywords['energy']: + df['E'] = E if input_obj.keywords['gradient']: df2 = 
pd.DataFrame(data=[G.flatten().tolist()],index=None, columns=grad_cols) df = pd.concat([df, df2], axis=1) + if input_obj.keywords['hessian']: + df3 = pd.DataFrame(data=[H.flatten().tolist()], index=None, columns=hess_cols) + df = pd.concat([df,df3], axis=1) data = pd.concat([data, df]) if input_obj.keywords['pes_redundancy'] == 'true': continue @@ -105,6 +152,12 @@ def extract_gradient(output_obj, h=header, f=footer, g=grad_line_regex): os.chdir('../') if input_obj.keywords['sort_pes'] == 'true': - data = data.sort_values("E") + if input_obj.keywords['gradient'] or input_obj.keywords['hessian']: + if input_obj.keywords['energy']: + data = data.sort_values("E") + else: + print("Keyword 'sort_pes' is set to 'true' (default), this only applies to energies and your data has NOT been sorted") + else: + data = data.sort_values("E") data.to_csv(input_obj.keywords['pes_name'], sep=',', index=False, float_format='%12.12f') print("Parsed data has been written to {}".format(input_obj.keywords['pes_name'])) From 280995bc6bc0788a96be20e28ad78d18f1c6e276 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Tue, 2 Jan 2024 14:17:37 -0500 Subject: [PATCH 13/28] added hessian keyword --- peslearn/input_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/peslearn/input_processor.py b/peslearn/input_processor.py index 6384d0c..b303942 100644 --- a/peslearn/input_processor.py +++ b/peslearn/input_processor.py @@ -42,6 +42,7 @@ def get_keywords(self): 'gradient_header': None, # gradient header regular expression string 'gradient_footer': None, # gradient footer regular expression string 'gradient_line': None, # regular expression string for one line of the cartesian gradient + 'hessian':None, # parse hessian with 'schema' 'input_name': 'input.dat', # what to call new input files generated from template, can be any name 'output_name': 'output.dat', # the name of electronic structure theory output files corresponding to input_name 'ml_model': 'gp', # 'gp', 'nn' From 
d6969401c209bd5ad11182d32bffc4535f55a58a Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 10 Jan 2024 12:58:05 -0500 Subject: [PATCH 14/28] fixed parsing to handle if QCSchema errors --- peslearn/datagen/outputfile.py | 40 ++++++++++++---- peslearn/utils/parsing_helper.py | 80 +++++++++++++------------------- 2 files changed, 64 insertions(+), 56 deletions(-) diff --git a/peslearn/datagen/outputfile.py b/peslearn/datagen/outputfile.py index f0d51fd..29bcdfa 100644 --- a/peslearn/datagen/outputfile.py +++ b/peslearn/datagen/outputfile.py @@ -112,21 +112,32 @@ def extract_from_schema(self, driver): Parameters --------- driver : str - The 'highest-order' result to parse from QCSchema output: 'energy', 'gradient', 'hessian' or 'properties' + The result to parse from QCSchema output: 'energy', 'gradient', 'hessian' or 'properties' Returns --------- energy : str The energy result from the 'return_result' item in the standard QCSchema output gradient : np.array - A numpy array of floats representing the cartesian gradient from the 'return_result' item in the standard QCSchema output + A numpy array of floats representing the cartesian gradient from the 'retun_gradient' item in the standard QCSchema output hessian : np.array + A numpy array of floats representing the hessian from the 'return_hessian' item in the standard QCSchema output """ if driver == "energy": + energy = [] energy = re.findall("\s\'return_energy\'\:\s+(-\d+\.\d+)", self.output_str) - #add some statment for a failed computation - return energy + if energy: + return energy + else: + success = re.findall("\s\'success\'\:\s+(\S+)\}", self.output_str) + if success[0] == 'False': + energy = 'False' + return energy + else: + # if success is 'True' but energy was not found + energy = 'False' + return energy if driver == "gradient": gradient = re.findall("\s\'return_gradient\'\:\s+array\(([\s\S]*?)\)\,", self.output_str) @@ -139,8 +150,14 @@ def extract_from_schema(self, driver): gradient = 
np.asarray(ast.literal_eval(gradient)).astype(np.float64) return gradient else: - return None - #add failed computaiton + success = re.findall("\s\'success\'\:\s+(\S+)\}", self.output_str) + if success[0] == 'False': + gradient = 'False' + return gradient + else: + gradient = 'False' + return gradient + if driver == "hessian": hessian = re.findall("\s\'return_hessian\'\:\s+array\(([\s\S]*?)\)\,", self.output_str) @@ -153,10 +170,15 @@ def extract_from_schema(self, driver): hessian = np.asarray(ast.literal_eval(hessian)).astype(np.float64) return hessian else: - return None - - #add failed computation error + success = re.findall("\s\'success\'\:\s+(\S+)\}", self.output_str) + if success[0] == 'False': + hessian = 'False' + return hessian + else: + hessian = 'False' + return hessian + #TODO add support for properties from QCSchema outuput if driver == "properties": properties = None diff --git a/peslearn/utils/parsing_helper.py b/peslearn/utils/parsing_helper.py index 58031a9..f67f7a2 100644 --- a/peslearn/utils/parsing_helper.py +++ b/peslearn/utils/parsing_helper.py @@ -24,39 +24,10 @@ def extract_energy(input_obj, output_obj): else: raise Exception("\n energy_regex value not assigned in input. Please add a regular expression which captures the energy value, e.g. 
energy_regex = 'RHF Final Energy: \s+(-\d+\.\d+)'") - if input_obj.keywords['energy'] == 'schema': def extract_energy(input_obj, output_obj): energy = output_obj.extract_from_schema(driver='energy') return energy - - # define extractions from schema based on user keywords - # if input_obj.keywords['energy'] == 'schema' or input_obj.keywords['gradient'] == 'schema' or input_obj.keywords['hessian'] == 'schema': - # def extract_energy(input_obj, output_obj): - # if input_obj.keywords['schema_driver'] == 'energy': - # energy = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) - # return energy - # elif input_obj.keywords['schema_driver'] == 'gradient': - # if input_obj.keywords['energy'] == 'schema': - # energy, gradient = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) - # return energy, gradient - # else: - # gradient = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) - # return gradient - # elif input_obj.keywords['schema_driver'] == 'hessian': - # if input_obj.keywords['energy'] == 'schema': - # if input_obj.keywords['gradient'] == 'schema': - # energy, gradient, hessian = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) - # return energy, gradient, hessian - # elif not input_obj.keywords['gradient'] == 'schema': - # energy, hessian = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) - # return energy, hessian - # elif input_obj.keywords['gradient'] == 'schema' and not input_obj.keywords['energy'] == 'schema': - # gradient, hessian = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) - # return gradient, hessian - # elif not input_obj.keywords['energy'] == 'schema' and not input_obj.keywords['gradient'] == 'schema': - # hessian = output_obj.extract_from_schema(input_obj.keywords['schema_driver'], input_obj.keywords) - # return hessian # 
define gradient extraction routine based on user keywords if input_obj.keywords['gradient'] == 'cclib': @@ -115,6 +86,9 @@ def extract_hessian(input_obj, output_obj): # parse output files + E = 0 + G = 0 + H = 0 os.chdir("./" + input_obj.keywords['pes_dir_name']) dirs = [i for i in os.listdir(".") if os.path.isdir(i) ] dirs = sorted(dirs, key=lambda x: int(x)) @@ -131,24 +105,29 @@ def extract_hessian(input_obj, output_obj): H = extract_hessian(input_obj, output_obj) nhess = (3*(mol.n_atoms - mol.n_dummy))*(3*(mol.n_atoms - mol.n_dummy)) hess_cols = ["h%d" % (i) for i in range(nhess)] - - with open(d + geom_path) as f: - for line in f: - tmp = json.loads(line, object_pairs_hook=OrderedDict) - df = pd.DataFrame(data=tmp, index=None, columns=tmp[0].keys()) - if input_obj.keywords['energy']: - df['E'] = E - if input_obj.keywords['gradient']: - df2 = pd.DataFrame(data=[G.flatten().tolist()],index=None, columns=grad_cols) - df = pd.concat([df, df2], axis=1) - if input_obj.keywords['hessian']: - df3 = pd.DataFrame(data=[H.flatten().tolist()], index=None, columns=hess_cols) - df = pd.concat([df,df3], axis=1) - data = pd.concat([data, df]) - if input_obj.keywords['pes_redundancy'] == 'true': - continue - else: - break + + if E == 'False' or G == 'False' or H == 'False': + with open('errors.txt','a') as e: + error_string = 'File in dir {} returned an error, the parsed output has been omitted from {}.\n'.format(d, input_obj.keywords['pes_name']) + e.write(error_string) + else: + with open(d + geom_path) as f: + for line in f: + tmp = json.loads(line, object_pairs_hook=OrderedDict) + df = pd.DataFrame(data=tmp, index=None, columns=tmp[0].keys()) + if input_obj.keywords['energy']: + df['E'] = E + if input_obj.keywords['gradient']: + df2 = pd.DataFrame(data=[G.flatten().tolist()],index=None, columns=grad_cols) + df = pd.concat([df, df2], axis=1) + if input_obj.keywords['hessian']: + df3 = pd.DataFrame(data=[H.flatten().tolist()], index=None, columns=hess_cols) + df = 
pd.concat([df,df3], axis=1) + data = pd.concat([data, df]) + if input_obj.keywords['pes_redundancy'] == 'true': + continue + else: + break os.chdir('../') if input_obj.keywords['sort_pes'] == 'true': @@ -161,3 +140,10 @@ def extract_hessian(input_obj, output_obj): data = data.sort_values("E") data.to_csv(input_obj.keywords['pes_name'], sep=',', index=False, float_format='%12.12f') print("Parsed data has been written to {}".format(input_obj.keywords['pes_name'])) + # if num_errors > 0: + # print("One or more output files returned an error, refer to {}/errors.txt for more information".format(input_obj.keywords['pes_dir_name'])) + + + error_path = "./" + input_obj.keywords['pes_dir_name'] + "/errors.txt" + if os.path.exists(error_path): + print("One or more output files returned an error, refer to {}/errors.txt for more information".format(input_obj.keywords['pes_dir_name'])) From f490c0f26855d5349802add7109573022c5f2782 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Mon, 4 Mar 2024 13:25:19 -0500 Subject: [PATCH 15/28] Added KernelRidgeReg as new machine learning model --- peslearn/constants.py | 55 +++++++ peslearn/input_processor.py | 5 +- peslearn/ml/__init__.py | 1 + peslearn/ml/kernel_ridge_reg.py | 281 ++++++++++++++++++++++++++++++++ setup.py | 2 +- 5 files changed, 341 insertions(+), 3 deletions(-) create mode 100644 peslearn/ml/kernel_ridge_reg.py diff --git a/peslearn/constants.py b/peslearn/constants.py index 1eb1962..bf79e84 100644 --- a/peslearn/constants.py +++ b/peslearn/constants.py @@ -125,3 +125,58 @@ def cart1d_to_distances1d(vec): distance_vector = distance_matrix[np.tril_indices(len(distance_matrix),-1)] return distance_vector """ + +krr_convenience_funciton = """ +# How to use 'compute_energy()' function +# -------------------------------------- +# E = compute_energy(geom_vectors, cartesian=bool) +# 'geom_vectors' is either: +# 1. A list or tuple of coordinates for a single geometry. +# 2. 
A column vector of one or more sets of 1d coordinate vectors as a list of lists or 2D NumPy array: +# [[ coord1, coord2, ..., coordn], +# [ coord1, coord2, ..., coordn], +# : : : ], +# [ coord1, coord2, ..., coordn]] +# In all cases, coordinates should be supplied in the exact same format and exact same order the model was trained on. +# If the coordinates format used to train the model was interatomic distances, each set of coordinates should be a 1d array of either interatom distances or cartesian coordinates. +# If cartesian coordinates are supplied, cartesian=True should be passed and it will convert them to interatomic distances. +# The order of coordinates matters. If PES-Learn datasets were used they should be in standard order; +# i.e. cartesians should be supplied in the order x,y,z of most common atoms first, with alphabetical tiebreaker. +# e.g., C2H3O2 --> H1x H1y H1z H2x H2y H2z H3x H3y H3z C1x C1y C1z C2x C2y C2z O1x O1y O1z O2x O2y O2z +# and interatom distances should be the row-wise order of the lower triangle of the interatom distance matrix, with standard order atom axes: +# H H H C C O O +# H +# H 1 +# H 2 3 +# C 4 5 6 +# C 7 8 9 10 +# O 11 12 13 14 15 +# O 16 17 18 19 20 21 + +# The returned energy array is a column vector of corresponding energies. Elements can be accessed with E[0,0], E[0,1], E[0,2] +# NOTE: Sending multiple geometries through at once is much faster than a loop of sending single geometries through. 
+ +def pes(geom_vectors, cartesian=True): + g = np.asarray(geom_vectors) + if cartesian: + axis = 1 + if len(g.shape) < 2: + axis = 0 + g = np.apply_along_axis(cart1d_to_distances1d, axis, g) + newX = krr.transform_new_X(g, params, Xscaler) + E = model.predict(newX) + e = nn.inverse_transform_new_y(E, yscaler) + #e = e - (insert min energy here) + #e *= 219474.63 ( convert units ) + return e + +def cart1d_to_distances1d(vec): + vec = vec.reshape(-1,3) + n = len(vec) + distance_matrix = np.zeros((n,n)) + for i,j in combinations(range(len(vec)),2): + R = np.linalg.norm(vec[i]-vec[j]) + distance_matrix[j,i] = R + distance_vector = distance_matrix[np.tril_indices(len(distance_matrix),-1)] + return distance_vector +""" diff --git a/peslearn/input_processor.py b/peslearn/input_processor.py index b303942..699f783 100644 --- a/peslearn/input_processor.py +++ b/peslearn/input_processor.py @@ -65,7 +65,7 @@ def get_keywords(self): 'hp_maxit': 20, # any int 'rseed': None, # any int 'gp_ard': 'true', # 'true', 'false'. 'opt' treats as hyperparameter - 'nas_trial_layers': None, # List of lists e.g. [[10,], [10,10,10], [50,50]] + 'nas_trial_layers': None, # List of tuples e.g. 
[(10,), (10,10,10), (50,50)] 'nn_precision': 32, # neural network floating point precision 32 or 64 'hp_opt': 'true', # 'false' 'schema_generate' : 'false', # 'true' @@ -74,7 +74,8 @@ def get_keywords(self): 'schema_basis' : None, # any basis interperetable by QC software of choice 'schema_driver' : 'energy', # 'hessian', 'gradient', 'properties' 'schema_keywords' : None, # any keywords interperetable by QC software of choice, python dictionary in quotes - 'schema_prog' : None # any program supported by QCEngine + 'schema_prog' : None, # any program supported by QCEngine + 'kernel' : None # None or 'verbose' to use only RBF kernel or all possible kernels } for k in string_keywords: diff --git a/peslearn/ml/__init__.py b/peslearn/ml/__init__.py index bc3a5c0..4c33d1c 100644 --- a/peslearn/ml/__init__.py +++ b/peslearn/ml/__init__.py @@ -7,3 +7,4 @@ from .gaussian_process import GaussianProcess from .data_sampler import DataSampler from .neural_network import NeuralNetwork +from .kernel_ridge_reg import KernelRidgeReg diff --git a/peslearn/ml/kernel_ridge_reg.py b/peslearn/ml/kernel_ridge_reg.py new file mode 100644 index 0000000..2be1be4 --- /dev/null +++ b/peslearn/ml/kernel_ridge_reg.py @@ -0,0 +1,281 @@ +import numpy as np +import sklearn.metrics +import os +import sys +import re +import json +from sklearn.kernel_ridge import KernelRidge +from hyperopt import fmin, tpe, hp, STATUS_OK, STATUS_FAIL, Trials, space_eval + + +from .model import Model +from..constants import package_directory, hartree2cm, krr_convenience_funciton +from ..utils.printing_helper import hyperopt_complete +from ..lib.path import fi_dir +from .preprocessing_helper import morse, interatomics_to_fundinvar, degree_reduce, general_scaler + + +class KernelRidgeReg(Model): + """ + Constructs a Kernel Ridge Regression Model using scikit-learn + """ + def __init__(self, dataset_path, input_obj, molecule_type=None, molecule=None, train_path=None, test_path=None, valid_path=None): + 
super().__init__(dataset_path, input_obj, molecule_type, molecule, train_path, test_path, valid_path) + self.set_default_hyperparameters() + + def set_default_hyperparameters(self): + """ + Set default hyperparameter space. If none is provided, default is used. + """ + self.hyperparameter_space = { + 'scale_X': hp.choice('scale_X', ['std', 'mm01', 'mm11', None]), + 'scale_y': hp.choice('scale_y', ['std', 'mm01', 'mm11', None]), + } + + # Standard geometry transformations, always use these. + if self.input_obj.keywords['pes_format'] == 'interatomics': + self.set_hyperparameter('morse_transform', hp.choice('morse_transform',[{'morse': True,'morse_alpha': hp.quniform('morse_alpha', 1, 2, 0.1)},{'morse': False}])) + else: + self.set_hyperparameter('morse_transform', hp.choice('morse_transform',[{'morse': False}])) + if self.pip: + val = hp.choice('pip',[{'pip': True,'degree_reduction': hp.choice('degree_reduction', [True,False])}]) + self.set_hyperparameter('pip', val) + else: + self.set_hyperparameter('pip', hp.choice('pip', [{'pip': False}])) + + # Kernel hyperparameters + self.set_hyperparameter('alpha', hp.choice('alpha', [1e-06, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2, 1e+3, 1e+4, 1e+5, 1e+6])) + + # If 'kernel' keyword is 'None' (default) an rbf kernel will be used and only 'alpha' will be hyperparameter + if self.input_obj.keywords['kernel'] == None: + self.set_hyperparameter('kernel', hp.choice('kernel',[{'ktype': 'rbf', 'gamma': None, 'degree': None}])) + + # If 'kernel' keywords is 'verbose' choice of kernel will be hyperparameter + elif self.input_obj.keywords['kernel'] == 'verbose': + self.set_hyperparameter('kernel', hp.choice('kernel', [ + # {'ktype': 'chi2', 'gamma': hp.quniform('gamma', 0.5, 1.5, 0.1), 'degree': None}, + {'ktype': 'polynomial', 'gamma': None, 'degree': hp.quniform('degree', 1, 5, 1)}, + {'ktype': 'rbf', 'gamma': None, 'degree': None}, + {'ktype': 'laplacian', 'gamma': None, 'degree': None}, + {'ktype': 'sigmoid', 'gamma': None, 
'degree': None}, + {'ktype': 'cosine', 'gamma': None, 'degree': None} + ])) + + #TODO add option for 'precomputed' kernel + # If 'kernel' keywords is 'precomputed' only non-specified options will be hyperparameter + # elif self.input_obj.keywords['kernel'] == 'precomputed': ... + + def split_train_test(self, params): + """ + Take raw dataset and apply hyperparameters/input keywords/preprocessing + and train/test (tr,test) splitting. + Assigns: + self.X : complete input data, transformed + self.y : complete output data, trsnsformed + self.Xscaler : scaling transformer for inputs + self.yscaler : scaling transformer for outputs + self.Xtr : training input data, transformed + self.ytr : training output data, transformed + self.Xtest : test input data, transformed + self.ytext : test output data, transformed + """ + self.X, self.y, self.Xscaler, self.yscaler = self.preprocess(params, self.raw_X, self.raw_y) + if self.sampler == 'user_supplied': + self.Xtr = self.transform_new_X(self.raw_Xtr, params, self.Xscaler) + self.ytr = self.transform_new_y(self.raw_ytr, self.yscaler) + self.Xtest = self.transform_new_X(self.raw_Xtest, params, self.Xscaler) + self.ytest = self.transform_new_y(self.raw_ytest, self.yscaler) + else: + self.Xtr = self.X[self.train_indices] + self.ytr = self.y[self.train_indices] + self.Xtest = self.X[self.test_indices] + self.ytest = self.y[self.test_indices] + + def optimize_model(self): + print("Beginning hyperparameter optimization...") + print("Trying {} combinations of hyperparameters".format(self.hp_maxit)) + print("Training with {} points (Full dataset contains {} points).".format(self.ntrain, self.n_datapoints)) + print("Using {} training set point sampling.".format(self.sampler)) + print("Errors are root-mean-square error in wavenumbers (cm-1)") + self.hyperopt_trials = Trials() + self.itercount = 1 # keep track of hyperopt iterations + if self.input_obj.keywords['rseed']: + rstate = np.random.RandomState(self.input_obj.keywords['rseed']) + 
else: + rstate = None + best = fmin(self.hyperopt_model, + space=self.hyperparameter_space, + algo=tpe.suggest, + max_evals=self.hp_maxit*2, + rstate=rstate, + show_progressbar=False, + trials=self.hyperopt_trials) + hyperopt_complete() + print("Best performing hyperparameters are:") + final = space_eval(self.hyperparameter_space, best) + print(str(sorted(final.items()))) + self.optimal_hyperparameters = dict(final) + print("Fine-tuning final model...") + self.build_model(self.optimal_hyperparameters) + print("Final model performance (cm-1):") + self.test_error = self.vet_model(self.model) + print("Model optimization complete. Saving final model...") + self.save_model(self.optimal_hyperparameters) + + def build_model(self, params): + print("Hyperparameters: ", params) + self.split_train_test(params) + kernel = params['kernel']['ktype'] + if params['kernel']['gamma']: + gamma = params['kernel']['gamma'] + else: + gamma = None + if params['kernel']['degree']: + degree = int(params['kernel']['degree']) + else: + degree = 3 + alpha = params['alpha'] + self.model = KernelRidge(alpha=alpha, kernel=kernel, gamma=gamma, degree=degree) + print(self.ytr) + self.model = self.model.fit(self.Xtr, self.ytr) + + def vet_model(self, model): + """ + Convenience method for getting model errors of test and full datasets + """ + pred_test, rsq = self.predict(model, self.Xtest, ytest=self.ytest) + pred_full = self.predict(model, self.X) + error_test = self.compute_error(self.ytest, pred_test, self.yscaler) + error_full, median_error, max_errors, e = self.compute_error(self.y, pred_full, yscaler=self.yscaler, max_errors=5) + print("R^2 {}".format(rsq)) + print("Test Dataset {}".format(round(hartree2cm * error_test,2)), end=' ') + print("Full Dataset {}".format(round(hartree2cm * error_full,2)), end=' ') + print("Median error: {}".format(np.round(median_error[0],2)), end=' ') + print("Max 5 errors: {}".format(np.sort(np.round(max_errors.flatten(),1))),'\n') + error_test_invcm = 
round(hartree2cm * error_test,2) + return error_test_invcm + + def predict(self, model, data_in, ytest=None): + prediciton = model.predict(data_in) + # compute R-squared if requested + if ytest is not None: + rsq = model.score(data_in, ytest) + return prediciton, rsq + else: + return prediciton + + def hyperopt_model(self, params): + """ + Hyperopt-friendly wrapper for build_model + """ + # skip building this model if hyperparameter combination already attempted + for i in self.hyperopt_trials.results: + if 'memo' in i: + if params == i['memo']: + return {'loss': i['loss'], 'status': STATUS_OK, 'memo': 'repeat'} + if self.itercount > self.hp_maxit: + return {'loss': 0.0, 'status': STATUS_FAIL, 'memo': 'max iters reached'} + self.build_model(params) + error_test = self.vet_model(self.model) + self.itercount += 1 + return {'loss': error_test, 'status': STATUS_OK, 'memo': params} + + def preprocess(self, params, raw_X, raw_y): + """ + Preprocess raw data according to hyperparameters + """ + if params['morse_transform']['morse']: + raw_X = morse(raw_X, params['morse_transform']['morse_alpha']) + if params['pip']['pip']: + # find path to fundamental invariants from molecule type AxByCz... 
+ path = os.path.join(fi_dir, self.molecule_type, "output") + raw_X, degrees = interatomics_to_fundinvar(raw_X, path) + if params['pip']['degree_reduction']: + raw_X = degree_reduce(raw_X, degrees) + if params['scale_X']: + X, Xscaler = general_scaler(params['scale_X'], raw_X) + else: + X = raw_X + Xscaler = None + if params['scale_y']: + y, yscaler = general_scaler(params['scale_y'], raw_y) + else: + y = raw_y + yscaler = None + return X, y, Xscaler, yscaler + + def save_model(self, params): + print("Saving ML model data...") + model_path = "model1_data" + while os.path.isdir(model_path): + new = int(re.findall("\d+", model_path)[0]) + 1 + model_path = re.sub("\d+", str(new), model_path) + os.mkdir(model_path) + os.chdir(model_path) + with open('hyperparameters', 'w') as f: + print(params, file=f) + from joblib import dump + dump(self.model, 'model.joblib') + + if self.sampler == 'user_supplied': + self.traindata.to_csv('train_set',sep=',',index=False,float_format='%12.12f') + self.testdata.to_csv('test_set', sep=',', index=False, float_format='%12.12f') + else: + self.dataset.iloc[self.train_indices].to_csv('train_set',sep=',',index=False,float_format='%12.12f') + self.dataset.iloc[self.test_indices].to_csv('test_set', sep=',', index=False, float_format='%12.12f') + + self.dataset.to_csv('PES.dat', sep=',',index=False,float_format='%12.12f') + # write convenience function + with open('compute_energy.py', 'w+') as f: + print(self.write_convenience_function(), file=f) + + # print model performance + sys.stdout = open('performance', 'w') + self.vet_model(self.model) + sys.stdout = sys.__stdout__ + os.chdir("../") + + + def transform_new_X(self, newX, params, Xscaler=None): + """ + Transform a new, raw inpur according to the model's transformation procedure + so that prediction can be made. 
+ """ + # ensure x dimension is n x m (n new points, m input variables) + if len(newX.shape) == 1: + newX = np.expand_dims(newX, 0) + elif len(newX) > 2: + raise Exception("Dimensions of input data is incorrect.") + if params['morse_transform']['morse']: + newX = morse(newX, params['morse_transform']['morse_alpha']) + if params['pip']['pip']: + # find path to fundamental invariants for an N atom subsystem with molecule type AxByCz... + path = os.path.join(package_directory, "lib", self.molecule_type, "output") + newX, degrees = interatomics_to_fundinvar(newX, degrees) + if Xscaler: + newX = Xscaler.transform(newX) + return newX + + def transform_new_y(self, newy, yscaler=None): + if yscaler: + newy = yscaler.transform(newy) + return newy + + def inverse_transform_new_y(self, newy, yscaler=None): + if yscaler: + newy = yscaler.transform(newy) + return newy + + def write_convenience_function(self): + string = "from peslearn.ml import KernelRidgeReg\nfrom peslearn import InputProcessor\nfrom sklearn.kernel_ridge import KernelRidge\nimport numpy as np\nifrom joblib import load\nfrom itertools import combinationa\n\n" + if self.pip: + string += "krr = KernelRidgeReg('PES.dat', InputProcessor(''), molecule_type='{}')\n".format(self.molecule_type) + else: + string += "krr = KernelRidgeReg('PES.dat', InputProcessor(''))\n" + with open('hyperparameters', 'r') as f: + hyperparameters = f.read() + string += "params = {}\n".format(hyperparameters) + string += "X, y, Xscaler, yscaler = krr.preprocess(params, krr.raw_X, krr.raw_y)\n" + string += "model = load(model.joblib)" + string += krr_convenience_funciton + return string diff --git a/setup.py b/setup.py index bf450d0..49adff4 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ license='BSD-3C', packages=setuptools.find_packages(), install_requires=[ - 'numpy>=1.7','GPy>=1.9','scikit-learn>=0.20','pandas>=0.24','hyperopt>=0.1.1','cclib>=1.6', 'torch>=1.0.1', 'qcelemental>=0.27.1', 'qcengine>=0.26.0' + 
'numpy>=1.7','GPy>=1.9','scikit-learn>=0.20','pandas>=0.24','hyperopt>=0.1.1','cclib>=1.6', 'torch>=1.0.1', 'joblib>=1.3.0', 'qcelemental>=0.27.1', 'qcengine>=0.26.0' ], extras_require={ 'docs': [ From e05e819e1c0661eb0205680ed7d0deb72679b303 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Mon, 4 Mar 2024 13:34:33 -0500 Subject: [PATCH 16/28] fixed typo about units in comments --- peslearn/ml/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/peslearn/ml/model.py b/peslearn/ml/model.py index 4772074..0dbcb14 100644 --- a/peslearn/ml/model.py +++ b/peslearn/ml/model.py @@ -19,6 +19,7 @@ class Model(ABC): Subclasses which inherit from Model: - GaussianProcess - NeuralNetwork + - KernelRidgeReg Parameters ---------- @@ -183,7 +184,7 @@ def compute_error(self, known_y, prediction, yscaler=None, max_errors=None): Returns ------- error : float - Root mean square error in wavenumbers (cm-1) + Root mean square error in energy units provided (typically Hartree) """ if known_y.shape != prediction.shape: raise Exception("Shape of known_y and prediction must be the same") From d7cbda726c7c64389c19ec623134025607932a5e Mon Sep 17 00:00:00 2001 From: iantbeck Date: Mon, 4 Mar 2024 13:44:49 -0500 Subject: [PATCH 17/28] specified dtype for numpy asarray --- peslearn/ml/preprocessing_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/peslearn/ml/preprocessing_helper.py b/peslearn/ml/preprocessing_helper.py index 2dd73be..4f1be6c 100644 --- a/peslearn/ml/preprocessing_helper.py +++ b/peslearn/ml/preprocessing_helper.py @@ -100,7 +100,7 @@ def sort_architectures(layers, inp_dim): size += out_dim * struct[-1] sizes.append(size) sorted_indices = np.argsort(sizes).tolist() - layers = np.asarray(layers) + layers = np.asarray(layers, dtype=object) layers = layers[sorted_indices].tolist() return layers From e19ab6a8ca2032c929163f1a385733694a89c1ba Mon Sep 17 00:00:00 2001 From: iantbeck Date: Tue, 5 Mar 2024 11:23:43 -0500 Subject: [PATCH 
18/28] added options for custom kernel with KRR --- peslearn/ml/kernel_ridge_reg.py | 107 +++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 15 deletions(-) diff --git a/peslearn/ml/kernel_ridge_reg.py b/peslearn/ml/kernel_ridge_reg.py index 2be1be4..cc2ca90 100644 --- a/peslearn/ml/kernel_ridge_reg.py +++ b/peslearn/ml/kernel_ridge_reg.py @@ -46,11 +46,11 @@ def set_default_hyperparameters(self): # Kernel hyperparameters self.set_hyperparameter('alpha', hp.choice('alpha', [1e-06, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2, 1e+3, 1e+4, 1e+5, 1e+6])) - # If 'kernel' keyword is 'None' (default) an rbf kernel will be used and only 'alpha' will be hyperparameter + # if 'kernel' keyword is 'None' (default) an rbf kernel will be used and only 'alpha' will be hyperparameter if self.input_obj.keywords['kernel'] == None: self.set_hyperparameter('kernel', hp.choice('kernel',[{'ktype': 'rbf', 'gamma': None, 'degree': None}])) - # If 'kernel' keywords is 'verbose' choice of kernel will be hyperparameter + # if 'kernel' keyword is 'verbose' choice of kernel will be hyperparameter elif self.input_obj.keywords['kernel'] == 'verbose': self.set_hyperparameter('kernel', hp.choice('kernel', [ # {'ktype': 'chi2', 'gamma': hp.quniform('gamma', 0.5, 1.5, 0.1), 'degree': None}, @@ -61,9 +61,78 @@ def set_default_hyperparameters(self): {'ktype': 'cosine', 'gamma': None, 'degree': None} ])) - #TODO add option for 'precomputed' kernel - # If 'kernel' keywords is 'precomputed' only non-specified options will be hyperparameter - # elif self.input_obj.keywords['kernel'] == 'precomputed': ... 
+        #TODO add option for coef0 from scikit-learn docs
+        # if 'kernel' keyword is 'precomputed' choose hyperparameters accordingly
+        elif self.input_obj.keywords['kernel'] == 'precomputed':
+            if self.input_obj.keywords['precomputed_kernel']:
+                precomputed_kernel = self.input_obj.keywords['precomputed_kernel']
+                if 'kernel' in precomputed_kernel:
+                    kernels = list(precomputed_kernel['kernel'])
+                    self.set_hyperparameter('kernel', hp.choice('kernel', kernels))
+                    if 'polynomial' in kernels or 'poly' in kernels:
+                        print("WARNING: Polynomial type kernels are included in this hyperoptimization.")
+                        print("\t It is strongly cautioned against optimizing polynomial kernels in a precomputed kernel along with other types of kernels.")
+                        print("\t See KRR docs for more info.")
+                        # add link to docs?
+                    if 'degree' in precomputed_kernel:
+                        degrees = np.asarray(precomputed_kernel['degree'])
+                        if degrees[0] == 'uniform':
+                            self.set_hyperparameter('degree', hp.quniform('degree', int(degrees[1]), int(degrees[2]), int(degrees[3])))
+                        else:
+                            degrees = degrees.astype(np.float64)
+                            self.set_hyperparameter('degree', hp.choice('degree', degrees))
+                    else:
+                        if 'polynomial' in kernels or 'poly' in kernels:
+                            self.set_hyperparameter('degree', hp.quniform('degree', 1, 5, 1))
+                        else:
+                            self.set_hyperparameter('degree', 1)
+                else:
+                    if 'degree' in precomputed_kernel:
+                        degrees = np.asarray(precomputed_kernel['degree'])
+                        if degrees[0] == 'uniform':
+                            self.set_hyperparameter('kernel', hp.choice('kernel', [
+                                {'kernel': 'polynomial', 'degree': hp.quniform('degree', int(degrees[1]), int(degrees[2]), int(degrees[3]))},
+                                {'kernel': 'rbf', 'degree': 1},
+                                {'kernel': 'laplacian', 'degree': 1},
+                                {'kernel': 'sigmoid', 'degree': 1},
+                                {'kernel': 'cosine', 'degree': 1}
+                                ]))
+                        else:
+                            degrees = degrees.astype(np.float64)
+                            self.set_hyperparameter('kernel', hp.choice('kernel', [
+                                {'kernel': 'polynomial', 'degree': hp.choice('degree', degrees)},
+                                {'kernel': 'rbf', 'degree': 1},
+                                {'kernel': 'laplacian', 'degree': 1},
+                                
{'kernel': 'sigmoid', 'degree': 1},
+                                {'kernel': 'cosine', 'degree': 1}
+                                ]))
+                    else:
+                        self.set_hyperparameter('kernel', hp.choice('kernel', [
+                            {'kernel': 'polynomial', 'degree': hp.quniform('degree', 1, 5, 1)},
+                            {'kernel': 'rbf', 'degree': 1},
+                            {'kernel': 'laplacian', 'degree': 1},
+                            {'kernel': 'sigmoid', 'degree': 1},
+                            {'kernel': 'cosine', 'degree': 1}
+                            ]))
+
+                if 'gamma' in precomputed_kernel:
+                    gammas = np.asarray(precomputed_kernel['gamma'])
+                    if gammas[0] == 'uniform':
+                        self.set_hyperparameter('gamma', hp.quniform('gamma', float(gammas[1]), float(gammas[2]), float(gammas[3])))
+                    else:
+                        gammas = gammas.astype(np.float64)
+                        self.set_hyperparameter('gamma', hp.choice('gamma', gammas))
+                else:
+                    self.set_hyperparameter('gamma', None)
+
+                if 'alpha' in precomputed_kernel:
+                    alphas = np.asarray(precomputed_kernel['alpha'])
+                    if alphas[0] == 'uniform':
+                        self.set_hyperparameter('alpha', hp.quniform('alpha', float(alphas[1]), float(alphas[2]), float(alphas[3])))
+                    else:
+                        alphas = alphas.astype(np.float64)
+                        self.set_hyperparameter('alpha', hp.choice('alpha', alphas))
+
 
     def split_train_test(self, params):
         """
@@ -125,18 +194,26 @@ def optimize_model(self):
     def build_model(self, params):
         print("Hyperparameters: ", params)
         self.split_train_test(params)
-        kernel = params['kernel']['ktype']
-        if params['kernel']['gamma']:
-            gamma = params['kernel']['gamma']
-        else:
-            gamma = None
-        if params['kernel']['degree']:
-            degree = int(params['kernel']['degree'])
+        if self.input_obj.keywords['kernel'] == 'precomputed':
+            gamma = params['gamma']
+            if 'kernel' not in self.input_obj.keywords['precomputed_kernel']:
+                degree = int(params['kernel']['degree'])
+                kernel = params['kernel']['kernel']
+            else:
+                degree = int(params['degree'])
+                kernel = params['kernel']
         else:
-            degree = 3
+            kernel = params['kernel']['ktype']
+            if params['kernel']['gamma']:
+                gamma = params['kernel']['gamma']
+            else:
+                gamma = None
+            if params['kernel']['degree']:
+                degree = int(params['kernel']['degree'])
+            
else: + degree = 3 alpha = params['alpha'] self.model = KernelRidge(alpha=alpha, kernel=kernel, gamma=gamma, degree=degree) - print(self.ytr) self.model = self.model.fit(self.Xtr, self.ytr) def vet_model(self, model): @@ -146,7 +223,7 @@ def vet_model(self, model): pred_test, rsq = self.predict(model, self.Xtest, ytest=self.ytest) pred_full = self.predict(model, self.X) error_test = self.compute_error(self.ytest, pred_test, self.yscaler) - error_full, median_error, max_errors, e = self.compute_error(self.y, pred_full, yscaler=self.yscaler, max_errors=5) + error_full, median_error, max_errors = self.compute_error(self.y, pred_full, yscaler=self.yscaler, max_errors=5) print("R^2 {}".format(rsq)) print("Test Dataset {}".format(round(hartree2cm * error_test,2)), end=' ') print("Full Dataset {}".format(round(hartree2cm * error_full,2)), end=' ') From 509a3a0ff0950e6ece53e9c2ba2df9a099fa19f1 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 6 Mar 2024 11:13:01 -0500 Subject: [PATCH 19/28] cleaned up keywords --- 3_Keywords/keywords.md | 225 +++++++++++++++++++----------------- peslearn/input_processor.py | 41 +++---- 2 files changed, 138 insertions(+), 128 deletions(-) diff --git a/3_Keywords/keywords.md b/3_Keywords/keywords.md index e878d20..dbdc719 100644 --- a/3_Keywords/keywords.md +++ b/3_Keywords/keywords.md @@ -38,29 +38,22 @@ When using command line interface `python path/to/peslearn/driver.py`, to specif * `mode = generate`, `mode = parse`, or `mode = learn`, or the corresponding shorthand `mode = g` `mode = p`, `mode = l` If this keyword is not used, the software will ask what you want to do. -## Data Generation Keywords - -* `input_name` - **Description:** The name of generated input files for electronic structure theory packages. 
- * **Type**: string, surrounded by quotes - * **Default**: 'input.dat' - * **Possible values**: any string - - -* `output_name` - **Description:** The name of electronic structure theory package output (log) files which PES-Learn will attempt to parse. - * **Type**: string, surrounded by quotes - * **Default**: 'output.dat' - * **Possible values**: any string - +## Data Generation Keywords (in alphabetical order) * `energy` - **Description:** Energy parsing method, regular expressions or cclib. + **Description:** Energy parsing method, regular expressions, cclib, or schema. * **Type**: string * **Default**: None - * **Possible values**: regex, cclib + * **Possible values**: regex, cclib, schema +* `energy_cclib` + **Description:** Use cclib to parse energies from output files. Takes the last occurance captured by cclib. + * **Type**: string + * **Default**: None + * **Possible values**: scfenergies, mpenergies, ccenergies + + * `energy_regex` **Description:** Regular expression pattern which captures electronic energy from an electronic structure theory code output file. Always takes the last occuring match in the output file. Floating point numbers `(?-\d+\.\d+)` should be surrounded by parentheses to capture just the number. It is recommended to check your regular expressions with [Pythex](https://pythex.org/). Simply copy the part of the output file you are trying to capture as well as your trial regular expression to see if it properly captures the energy. * **Type**: string, surrounded by quotes @@ -68,13 +61,34 @@ If this keyword is not used, the software will ask what you want to do. * **Possible values**: Any regular expression string. -* `energy_cclib` - **Description:** Use cclib to parse energies from output files. Takes the last occurance captured by cclib. - * **Type**: string +* `eq_geom` + **Description:** Forces this one geometry (typically equilibrium geometry) into the dataset. 
Internal coordinates are supplied in the order they appear in the Z-Matrix of the input file. + * **Type**: list * **Default**: None - * **Possible values**: scfenergies, mpenergies, ccenergies + * **Possible values**: `[1.0, 1.0, 104.5, 1.5, 120, 180]`, etc. + + +* `grid_reduction` + **Description:** Reduce the size of the internal coordinate grid to _n_ points. Acts **after** redundancy removal. Analyzes Euclidean distances between all datapoints, and creates a sub-grid of *n* geometries which are maximally far apart from one another. + * **Type**: int + * **Default**: None + * **Possible values**: any integer, less than the total number of points in the internal coordinate grid after redundancies are removed. + + +* `input_name` + **Description:** The name of generated input files for electronic structure theory packages. + * **Type**: string, surrounded by quotes + * **Default**: 'input.dat' + * **Possible values**: any string + + +* `output_name` + **Description:** The name of electronic structure theory package output (log) files which PES-Learn will attempt to parse. + * **Type**: string, surrounded by quotes + * **Default**: 'output.dat' + * **Possible values**: any string + - * `pes_dir_name` **Description:** The name of the directory containing all electronic structure theory package input and/or output files. Used both when generating and parsing data. * **Type**: string, surrounded by quotes @@ -96,13 +110,13 @@ If this keyword is not used, the software will ask what you want to do. * **Possible values**: any string -* `remove_redundancy` - **Description:** Removes symmetry-redundant geometries from internal coordinate grid +* `pes_redundancy` + **Description:** Include all redundant geometries and assign appropriate redundant energies when creating a dataset with parsing capability. Doesn't do anything unless `remember_redundancy` was set to true when data was generated. 
* **Type**: bool - * **Default**: true - * **Possible values**: true, false - - + * **Default**: false + * **Possible values**: true, false + + * `remember_redundancy` **Description:** Remember symmetry-redundant geometries when they are removed using `remove_redundancy`. This is done so that redundant geometries can be included in the dataset created when parsing, and assigned the appropriate energy of its redundant partner whos energy was actually computed. These geometries are included when parsing only if `pes_redundancy` is set to true. * **Type**: bool @@ -110,50 +124,19 @@ If this keyword is not used, the software will ask what you want to do. * **Possible values**: true, false -* `pes_redundancy` - **Description:** Include all redundant geometries and assign appropriate redundant energies when creating a dataset with parsing capability. Doesn't do anything unless `remember_redundancy` was set to true when data was generated. +* `remove_redundancy` + **Description:** Removes symmetry-redundant geometries from internal coordinate grid * **Type**: bool - * **Default**: false - * **Possible values**: true, false - + * **Default**: true + * **Possible values**: true, false -* `grid_reduction` - **Description:** Reduce the size of the internal coordinate grid to _n_ points. Acts **after** redundancy removal. Analyzes Euclidean distances between all datapoints, and creates a sub-grid of *n* geometries which are maximally far apart from one another. - * **Type**: int - * **Default**: None - * **Possible values**: any integer, less than the total number of points in the internal coordinate grid after redundancies are removed. - - -* `eq_geom` - **Description:** Forces this one geometry (typically equilibrium geometry) into the dataset. Internal coordinates are supplied in the order they appear in the Z-Matrix of the input file. - * **Type**: list - * **Default**: None - * **Possible values**: `[1.0, 1.0, 104.5, 1.5, 120, 180]`, etc. 
- * `sort_pes` **Description:** When parsing to produce a dataset, sort the energies in increasing order. * **Type**: bool * **Default**: true - * **Possible values**: true, false - -* `schema_generate` - **Description:** Generate input files that will run with QCEngine to produce QCSchema outputs. - * **Type**: bool - * **Default**: false - * **Possible values**: true, false - -* `schema_units` - **Description:** The units of the provided Z-Matrix input. QCEngine expects input units of Angstroms so Bohr will be converted. - * **Type**: string - * **Default**: angstrom - * **Possible values**: bohr, angstrom + * **Possible values**: true, false -* `schema_method` - **Description:** Any method that can be interpreted by the quantum chemical software of choice. - * **Type**: string - * **Default**: None - * **Possible values**: any string, e.g. 'hf', 'ccsd', etc. * `schema_basis` **Description:** Any basis that can be interpreted by the quantum chemical software of choice. @@ -161,18 +144,35 @@ If this keyword is not used, the software will ask what you want to do. * **Default**: None * **Possible values**: any string, e.g. 'sto-3g', 'cc-pvdz', etc. + * `schema_driver` **Description:** The type of computation for QCEngine to run. * **Type**: string * **Default**: 'energy' * **Possible values**: 'energy', 'hessian', 'gradient', 'properties' + +* `schema_generate` + **Description:** Generate input files that will run with QCEngine to produce QCSchema outputs. + * **Type**: bool + * **Default**: false + * **Possible values**: true, false + + * `schema_keywords` **Description:** A python dictionary surrounded by quotes containing keywords to be used by the quantum chemical software of choice. * **Type**: dict, surrounted by quotes * **Default**: None * **Possible values**: any dict surrounded by quotes e.g. "{e_convergence': '1e-4', 'maxiter': '30'}" + +* `schema_method` + **Description:** Any method that can be interpreted by the quantum chemical software of choice. 
+ * **Type**: string + * **Default**: None + * **Possible values**: any string, e.g. 'hf', 'ccsd', etc. + + * `schema_prog` **Description:** The quantum chemical program to run the desired computation, must be a program supported by QCEngine. * **Type**: string @@ -180,41 +180,21 @@ If this keyword is not used, the software will ask what you want to do. * **Possible values**: any string e.g. 'psi4' -## Machine Learning Keywords - -* `ml_model` - **Description:** Use Gaussian process regression or neural networks? +* `schema_units` + **Description:** The units of the provided Z-Matrix input. QCEngine expects input units of Angstroms so Bohr will be converted. * **Type**: string - * **Default**: gp - * **Possible values**: gp, nn - + * **Default**: angstrom + * **Possible values**: bohr, angstrom -* `use_pips` - **Description:** Use software's library of fundamental invariant polynomials to represent the interatomic distances dataset in terms of permutation invariant polynomials. Requires that the dataset is an interatomic distance dataset produced by PES-Learn, or a properly formatted Cartesian coordinate external dataset. - * **Type**: bool - * **Default**: true - * **Possible values**: any string +## Machine Learning Keywords (in alphabetical order) -* `sampling` - **Description:** Training set sampling algorithm - * **Type**: string - * **Default**: structure_based - * **Possible values**: structure_based, smart_random, random - +* `gp_ard` + **Description:** Use auto-relevancy determination (ARD) in Gaussian process regression. If True, a length scale is optimized for each input value. If false, just one length scale is optimized. If gp_ard = opt, it is treated as a hyperparameter. False is typically better for high-dimensional inputs (>30). 
+ * **Type**: bool + * **Default**: true + * **Possible values**: true, false, or opt (treats as hyperparameter) -* `training_points` - **Description:** Number of training points - * **Type**: int - * **Default**: 50 - * **Possible values**: any int smaller than total dataset size. - -* `validation_points` - **Description:** Number of validation points. Currently only used for neural networks. - * **Type**: int - * **Default**: Random set of half the points remaining after setting aside training set. - * **Possible values**: Any positive integer smaller than (total dataset size - `training_points`). - * `hp_maxit` **Description:** Maximum number of hyperparameter tuning iterations. @@ -223,18 +203,11 @@ If this keyword is not used, the software will ask what you want to do. * **Possible values**: Any positive integer -* `rseed` - **Description:** Global random seed. Used for initializing hyperparameter optimization iterations, random training set sampling. - * **Type**: int - * **Default**: None - * **Possible values**: Any integer - - -* `gp_ard` - **Description:** Use auto-relevancy determination (ARD) in Gaussian process regression. If True, a length scale is optimized for each input value. If false, just one length scale is optimized. If gp_ard = opt, it is treated as a hyperparameter. False is typically better for high-dimensional inputs (>30). - * **Type**: bool - * **Default**: true - * **Possible values**: true, false, or opt (treats as hyperparameter) +* `ml_model` + **Description:** Use Gaussian process regression or neural networks? + * **Type**: string + * **Default**: gp + * **Possible values**: gp, nn * `nas_trial_layers` @@ -249,10 +222,46 @@ If this keyword is not used, the software will ask what you want to do. * **Type**: int * **Default**: 32 * **Possible values**: 32, 64 + + +* `rseed` + **Description:** Global random seed. Used for initializing hyperparameter optimization iterations, random training set sampling. 
+ * **Type**: int + * **Default**: None + * **Possible values**: Any integer + + +* `sampling` + **Description:** Training set sampling algorithm + * **Type**: string + * **Default**: structure_based + * **Possible values**: structure_based, smart_random, random + + +* `training_points` + **Description:** Number of training points + * **Type**: int + * **Default**: 50 + * **Possible values**: any int smaller than total dataset size. + + +* `use_pips` + **Description:** Use software's library of fundamental invariant polynomials to represent the interatomic distances dataset in terms of permutation invariant polynomials. Requires that the dataset is an interatomic distance dataset produced by PES-Learn, or a properly formatted Cartesian coordinate external dataset. + * **Type**: bool + * **Default**: true + * **Possible values**: any string + + +* `validation_points` + **Description:** Number of validation points. Currently only used for neural networks. + * **Type**: int + * **Default**: Random set of half the points remaining after setting aside training set. + * **Possible values**: Any positive integer smaller than (total dataset size - `training_points`). + * `` **Description:** - * **Type**: string + * **Type**: * **Default**: - * **Possible values**: any string + * **Possible values**: diff --git a/peslearn/input_processor.py b/peslearn/input_processor.py index 699f783..2cbe3d0 100644 --- a/peslearn/input_processor.py +++ b/peslearn/input_processor.py @@ -38,44 +38,45 @@ def get_keywords(self): string_keywords = {'energy': None, # parse energies with 'cclib', 'regex', or 'schema' 'energy_regex': None, # a regular expression string, surround by '' or "" 'energy_cclib': None, # a cclib energy option. 
'scfenergies', 'mpenergies', 'ccenergies' + 'eq_geom' : None, #[1.05, 1.15, 104.5] etc 'gradient': None, # parse gradients with 'cclib', 'regex', or 'schema' 'gradient_header': None, # gradient header regular expression string 'gradient_footer': None, # gradient footer regular expression string 'gradient_line': None, # regular expression string for one line of the cartesian gradient - 'hessian':None, # parse hessian with 'schema' + 'grid_generation' : 'fixed', # 'uniform' for uniform random drawing, else fixed intervals + 'grid_reduction' : None, # any int + 'gp_ard': 'true', # 'true', 'false'. 'opt' treats as hyperparameter + 'hessian': None, # parse hessian with 'schema' + 'hp_maxit': 20, # any int + 'hp_opt': 'true', # 'false' 'input_name': 'input.dat', # what to call new input files generated from template, can be any name - 'output_name': 'output.dat', # the name of electronic structure theory output files corresponding to input_name + 'kernel' : None, # None, 'verbose', or 'precomputed' 'ml_model': 'gp', # 'gp', 'nn' 'mode': None, # 'generate', 'parse', 'learn', or shorthand: 'g', 'p', 'l' + 'n_low_energy_train': 0, # any int + 'nas_trial_layers': None, # List of tuples e.g. 
[(10,), (10,10,10), (50,50)] + 'nn_precision': 32, # neural network floating point precision 32 or 64 + 'output_name': 'output.dat', # the name of electronic structure theory output files corresponding to input_name 'pes_name': 'PES.dat', # any name 'pes_dir_name': 'PES_data', # any name 'pes_redundancy': 'false', # 'true' 'pes_format': 'interatomics', # 'zmat' + 'precomputed_kernel' : None, # dict in quotes of precomputed kernel options 'remove_redundancy': 'true', # 'false' 'remember_redundancy' : 'false', # 'true' - 'grid_generation' : 'fixed', # 'uniform' for uniform random drawing, else fixed intervals - 'grid_reduction' : None, # any int - 'eq_geom' : None, #[1.05, 1.15, 104.5] etc - 'use_pips': 'true', #'false' - 'sort_pes': 'true', #'false' - 'sampling': 'structure_based', # 'structure_based','sobol', 'smart_random', 'random', 'energy_ordered' - 'n_low_energy_train': 0, # any int - 'training_points': None, # any int - 'validation_points': None, # any int - 'hp_maxit': 20, # any int 'rseed': None, # any int - 'gp_ard': 'true', # 'true', 'false'. 'opt' treats as hyperparameter - 'nas_trial_layers': None, # List of tuples e.g. 
[(10,), (10,10,10), (50,50)] - 'nn_precision': 32, # neural network floating point precision 32 or 64 - 'hp_opt': 'true', # 'false' - 'schema_generate' : 'false', # 'true' - 'schema_units' : 'angstrom', # 'bohr' - 'schema_method' : None, # any method interpretable by QC software of choice + 'sampling': 'structure_based', # 'structure_based','sobol', 'smart_random', 'random', 'energy_ordered' 'schema_basis' : None, # any basis interperetable by QC software of choice 'schema_driver' : 'energy', # 'hessian', 'gradient', 'properties' + 'schema_generate' : 'false', # 'true' 'schema_keywords' : None, # any keywords interperetable by QC software of choice, python dictionary in quotes + 'schema_method' : None, # any method interpretable by QC software of choice 'schema_prog' : None, # any program supported by QCEngine - 'kernel' : None # None or 'verbose' to use only RBF kernel or all possible kernels + 'schema_units' : 'angstrom', # 'bohr' + 'sort_pes': 'true', #'false' + 'training_points': None, # any int + 'use_pips': 'true', #'false' + 'validation_points': None # any int } for k in string_keywords: From d4e5366fe98cb1260caa2d7fb531e21820c8ff6d Mon Sep 17 00:00:00 2001 From: iantbeck Date: Thu, 14 Mar 2024 13:44:13 -0400 Subject: [PATCH 20/28] fixed typos in transform_new_x() --- peslearn/ml/kernel_ridge_reg.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/peslearn/ml/kernel_ridge_reg.py b/peslearn/ml/kernel_ridge_reg.py index cc2ca90..3077898 100644 --- a/peslearn/ml/kernel_ridge_reg.py +++ b/peslearn/ml/kernel_ridge_reg.py @@ -321,14 +321,16 @@ def transform_new_X(self, newX, params, Xscaler=None): # ensure x dimension is n x m (n new points, m input variables) if len(newX.shape) == 1: newX = np.expand_dims(newX, 0) - elif len(newX) > 2: + elif len(newX.shape) > 2: raise Exception("Dimensions of input data is incorrect.") if params['morse_transform']['morse']: newX = morse(newX, params['morse_transform']['morse_alpha']) if 
params['pip']['pip']: # find path to fundamental invariants for an N atom subsystem with molecule type AxByCz... path = os.path.join(package_directory, "lib", self.molecule_type, "output") - newX, degrees = interatomics_to_fundinvar(newX, degrees) + newX, degrees = interatomics_to_fundinvar(newX, path) + if params['pip']['degree_reduction']: + newX = degree_reduce(newX, degrees) if Xscaler: newX = Xscaler.transform(newX) return newX From 93a4ddcbdc69d358ab6e65d96b9d3cc81769da86 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 10 Apr 2024 15:48:31 -0400 Subject: [PATCH 21/28] added docs --- .github/workflows/continuous_integration.yml | 50 +++++ docs/Makefile | 20 ++ docs/README.md | 15 ++ docs/requirements.yml | 26 +++ docs/source/conf.py | 213 +++++++++++++++++++ docs/source/index.rst | 9 + peslearn/__init__.py | 1 + 7 files changed, 334 insertions(+) create mode 100644 docs/Makefile create mode 100644 docs/README.md create mode 100644 docs/requirements.yml create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 3e5011b..36a4f90 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -37,3 +37,53 @@ jobs: run: | conda install pytest pytest + + release_sphinx: + #needs: [build] + defaults: + run: + shell: bash -l {0} + strategy: + fail-fast: false + matrix: + cfg: + - conda-env: docs-cf + python-version: 3.8 + label: Sphinx + runs-on: ubuntu-latest + + name: "🐍 ${{ matrix.cfg.python-version }} • ${{ matrix.cfg.label }}" + runs-on: ${{ matrix.cfg.runs-on }} + + steps: + - uses: actions/checkout@v3 + + - name: Create Environment + uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: test + environment-file: docs/requirements.yml + python-version: ${{ matrix.cfg.python-version }} + auto-activate-base: false + miniforge-variant: Mambaforge + use-mamba: true + 
add-pip-as-python-dependency: true + channels: conda-forge + + - name: Environment Information + run: | + mamba info + mamba list --show-channel-urls + + - name: Build Documentation + run: | + python -m pip install . --no-deps + cd docs + make html + + - name: GitHub Pages Deploy + uses: JamesIves/github-pages-deploy-action@4.1.1 + if: github.event_name == 'push' && github.repository == 'CCQC/PES-Learn' && ( startsWith( github.ref, 'refs/tags/' ) || github.ref == 'refs/heads/master' ) + with: + branch: gh-pages + folder: docs/build/html \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..8883c11 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = PESLearn +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..a090ac7 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,15 @@ +# Compiling PESLearn's Documentation + +The docs for this project are built with Sphinx. To compile the docs, first ensure that Sphinx and the ReadTheDocs theme are installed. 
+ +``` +conda install sphinx sphinx_rtd_theme +``` + +Once installed, you can use the Makefile in this directory to compile static HTML pages by + +``` +make html +``` + +The compiled docs will be in the _build directory and can be viewed by opening index.html (which may itself be inside a directory called html/ depending on what version of Sphinx is installed). diff --git a/docs/requirements.yml b/docs/requirements.yml new file mode 100644 index 0000000..c688140 --- /dev/null +++ b/docs/requirements.yml @@ -0,0 +1,26 @@ +name: peslearn-docs +channels: + - nodefaults + - conda-forge +dependencies: + - python=3 + - sphinx + - sphinx_rtd_theme + - sphinx-automodapi + - sphinx-autodoc-typehints + - autodoc-pydantic + + # PESLearn depends + - numpy + - pydantic >=0.30.1 + - qcelemental >=0.9.0 + - numpy >=1.7 + - GPy >=1.9 + - scikit-learn >=0.20 + - pandas >=0.24 + - hyperopt >=0.1.1 + - cclib >=1.6 + - torch >=1.0.1 + - joblib >=1.3.0 + - qcelemental >=0.27.1 + - qcengine >=0.26.0 \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..87ada1a --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +import datetime +import os +import sys + +sys.path.insert(0, os.path.abspath('../..')) +import peslearn + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'PESLearn' +copyright = f'{datetime.datetime.today().year}' +author = 'The PESLearn Development Team' + +# The short X.Y version +version = peslearn.__version__ +# The full version, including alpha/beta/rc tags +release = peslearn.__version__ + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.doctest', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.extlinks', + 'sphinx.ext.graphviz', + 'sphinx.ext.autosummary', + 'sphinx.ext.napoleon', + 'sphinx_automodapi.automodapi', + 'sphinx_automodapi.automodsumm', + 'sphinx_automodapi.smart_resolver', + "sphinx_autodoc_typehints", + "sphinxcontrib.autodoc_pydantic", +] + +autosummary_generate = True +automodapi_toctreedirnm = 'api' +autodoc_typehints = "description" +napoleon_use_param = True +napoleon_use_rtype = True +autodoc_pydantic_model_hide_paramlist = True +autodoc_pydantic_model_show_config_summary = False +autodoc_pydantic_field_swap_name_and_alias = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. 
+# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'default' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'PESLearndoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). 
+ # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'PESLearn.tex', 'PESLearn Documentation', + 'The PESLearn Development Team', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'peslearn', 'PESLearn Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'PESLearn', 'PESLearn Documentation', + author, 'PESLearn', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Extension configuration ------------------------------------------------- + +extlinks = { + 'issue': ('https://github.com/CCQC/PES-Learn/issue/%s', 'GH#%s'), + 'pr': ('https://github.com/CCQC/PES-Learn/pull/%s', 'GH#%s') +} + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. 
+""" +intersphinx_mapping = {'python': ('https://docs.python.org/3.10', None), + "numpy": ("https://numpy.org/doc/stable/", None), + 'scipy': ('https://docs.scipy.org/doc/scipy/', None), + 'matplotlib': ('https://matplotlib.org/stable/', None), + "qcelemental": ("http://docs.qcarchive.molssi.org/projects/QCElemental/en/latest/", None), + "qcportal": ("http://docs.qcarchive.molssi.org/projects/QCPortal/en/latest/", None), + "qcfractal": ("http://docs.qcarchive.molssi.org/projects/QCFractal/en/latest/", None), + } +""" +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..0b07264 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,9 @@ +.. QCEngine documentation master file, created by + sphinx-quickstart on Fri Aug 17 09:45:43 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +========= +PESLearn +========= + diff --git a/peslearn/__init__.py b/peslearn/__init__.py index 9eb5a4f..9a6e346 100644 --- a/peslearn/__init__.py +++ b/peslearn/__init__.py @@ -6,3 +6,4 @@ from . 
import constants from .input_processor import InputProcessor +__version__ = "1.0.0" \ No newline at end of file From a581963c7f0c33f029711b15c57e0568960b1343 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 1 May 2024 12:42:19 -0400 Subject: [PATCH 22/28] Added krr test --- tests/test_models.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 016ca17..5f660ff 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -17,21 +17,40 @@ """) input_obj = peslearn.InputProcessor(input_string) +krr_input_string = (""" + hp_maxit = 50 + training_points = 700 + rseed = 3 + use_pips = true + sampling = structure_based + """) +krr_input_obj = peslearn.InputProcessor(krr_input_string) + def test_gp(): errors = [] for i in range(len(datasets)): gp = peslearn.ml.GaussianProcess(datasets[i], input_obj, mol_strings[i]) gp.optimize_model() errors.append(gp.test_error) - # Test set error < 15 cm-1 - assert errors[0] < 15 - assert errors[1] < 15 + print(gp.test_error) + # Test set error < 50 cm-1 + assert errors[0] < 50 + assert errors[1] < 50 def test_nn(): nn = peslearn.ml.NeuralNetwork(datasets[1], input_obj, mol_strings[1]) nn.optimize_model() - # Test set error < 15 cm-1 - assert nn.test_error < 15 + # Test set error < 50 cm-1 + assert nn.test_error < 50 +def test_krr(): + errors = [] + for i in range(len(datasets)): + krr = peslearn.ml.KernelRidgeReg(datasets[i], krr_input_obj, mol_strings[i]) + krr.optimize_model() + errors.append(krr.test_error) + # Test set error < 200 cm-1 + assert errors[0] < 200 + assert errors[1] < 200 From 76db45d9546fd05a91d922e3cb586acbf7f79a4f Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 1 May 2024 12:51:09 -0400 Subject: [PATCH 23/28] updated numpy legacy function --- peslearn/ml/data_sampler.py | 1 + peslearn/ml/gaussian_process.py | 2 +- peslearn/ml/kernel_ridge_reg.py | 2 +- peslearn/ml/neural_network.py | 2 +- 4 files changed, 4 
insertions(+), 3 deletions(-) diff --git a/peslearn/ml/data_sampler.py b/peslearn/ml/data_sampler.py index c5d85d9..66e4eb5 100644 --- a/peslearn/ml/data_sampler.py +++ b/peslearn/ml/data_sampler.py @@ -136,6 +136,7 @@ def sobol(self, delta=0.002278): The Sobol expression is as implemented in Manzhos, Carrington J Chem Phys 145, 2016, and papers they cite. """ + #TODO add option for delta # Problems: # 1. not easily reproducible with a random seed. # 2. Scaling. could in principle improve scaling by doing minibatches in while loop... e.g. test.sample(n=minibatch) diff --git a/peslearn/ml/gaussian_process.py b/peslearn/ml/gaussian_process.py index 8c00dc7..685e545 100644 --- a/peslearn/ml/gaussian_process.py +++ b/peslearn/ml/gaussian_process.py @@ -157,7 +157,7 @@ def optimize_model(self): self.hyperopt_trials = Trials() self.itercount = 1 # keep track of hyperopt iterations if self.input_obj.keywords['rseed']: - rstate = np.random.RandomState(self.input_obj.keywords['rseed']) + rstate = np.random.default_rng(self.input_obj.keywords['rseed']) else: rstate = None best = fmin(self.hyperopt_model, diff --git a/peslearn/ml/kernel_ridge_reg.py b/peslearn/ml/kernel_ridge_reg.py index 3077898..f513a6d 100644 --- a/peslearn/ml/kernel_ridge_reg.py +++ b/peslearn/ml/kernel_ridge_reg.py @@ -169,7 +169,7 @@ def optimize_model(self): self.hyperopt_trials = Trials() self.itercount = 1 # keep track of hyperopt iterations if self.input_obj.keywords['rseed']: - rstate = np.random.RandomState(self.input_obj.keywords['rseed']) + rstate = np.random.default_rng(self.input_obj.keywords['rseed']) else: rstate = None best = fmin(self.hyperopt_model, diff --git a/peslearn/ml/neural_network.py b/peslearn/ml/neural_network.py index 44e4d29..0d62cc5 100644 --- a/peslearn/ml/neural_network.py +++ b/peslearn/ml/neural_network.py @@ -97,7 +97,7 @@ def optimize_model(self): self.hyperopt_trials = Trials() self.itercount = 1 if self.input_obj.keywords['rseed']: - rstate = 
np.random.RandomState(self.input_obj.keywords['rseed']) + rstate = np.random.default_rng(self.input_obj.keywords['rseed']) else: rstate = None best = fmin(self.hyperopt_model, From 6df141c61d7d16bee33d2c2e2f51ae33fb2f0c18 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 1 May 2024 13:26:58 -0400 Subject: [PATCH 24/28] changed function naming scheme --- peslearn/constants.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/peslearn/constants.py b/peslearn/constants.py index bf79e84..b3e3a11 100644 --- a/peslearn/constants.py +++ b/peslearn/constants.py @@ -14,9 +14,9 @@ # Gaussian process convenience function writer gp_convenience_function = """ -# How to use 'compute_energy()' function +# How to use 'pes()' function # -------------------------------------- -# E = compute_energy(geom_vectors, cartesian=bool) +# E = pes(geom_vectors, cartesian=bool) # 'geom_vectors' is either: # 1. A list or tuple of coordinates for a single geometry. # 2. A column vector of one or more sets of 1d coordinate vectors as a list of lists or 2D NumPy array: @@ -70,9 +70,9 @@ def cart1d_to_distances1d(vec): nn_convenience_function = """ -# How to use 'compute_energy()' function +# How to use 'pes()' function # -------------------------------------- -# E = compute_energy(geom_vectors, cartesian=bool) +# E = pes(geom_vectors, cartesian=bool) # 'geom_vectors' is either: # 1. A list or tuple of coordinates for a single geometry. # 2. A column vector of one or more sets of 1d coordinate vectors as a list of lists or 2D NumPy array: @@ -127,9 +127,9 @@ def cart1d_to_distances1d(vec): """ krr_convenience_funciton = """ -# How to use 'compute_energy()' function +# How to use 'pes()' function # -------------------------------------- -# E = compute_energy(geom_vectors, cartesian=bool) +# E = pes(geom_vectors, cartesian=bool) # 'geom_vectors' is either: # 1. A list or tuple of coordinates for a single geometry. # 2. 
A column vector of one or more sets of 1d coordinate vectors as a list of lists or 2D NumPy array: @@ -165,7 +165,7 @@ def pes(geom_vectors, cartesian=True): g = np.apply_along_axis(cart1d_to_distances1d, axis, g) newX = krr.transform_new_X(g, params, Xscaler) E = model.predict(newX) - e = nn.inverse_transform_new_y(E, yscaler) + e = krr.inverse_transform_new_y(E, yscaler) #e = e - (insert min energy here) #e *= 219474.63 ( convert units ) return e From ef10dd1a1522a098b061fd788b2260abc5256c1f Mon Sep 17 00:00:00 2001 From: iantbeck Date: Wed, 1 May 2024 13:27:12 -0400 Subject: [PATCH 25/28] updated author info and version --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 49adff4..0f4265c 100644 --- a/setup.py +++ b/setup.py @@ -3,10 +3,9 @@ if __name__ == "__main__": setuptools.setup( name='peslearn', - version="0.1.1", + version="1.0.0", description='Automated Construction of Machine Learning Models of Molecular Potential Energy Surfaces.', - author='Adam Abbott', - author_email='adabbott@uga.edu', + author='Adam Abbott, Ian Beck', url="https://github.com/CCQC/PES-Learn", license='BSD-3C', packages=setuptools.find_packages(), From 1a950b47fbec557f5fd8b36e893aea5a8389bf94 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Fri, 3 May 2024 10:36:15 -0400 Subject: [PATCH 26/28] github continuous integration fixes --- .github/workflows/continuous_integration.yml | 6 ++-- docs/make.bat | 35 ++++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 docs/make.bat diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 36a4f90..2ef37c7 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -13,10 +13,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.7 + - name: Set up Python 3.8 uses: actions/setup-python@v2 with: - python-version: 
3.7 + python-version: 3.8 - name: Add conda to system path run: | # $CONDA is an environment variable pointing to the root of the miniconda directory @@ -24,7 +24,7 @@ jobs: - name: Install dependencies shell: bash -l {0} run: | - conda install python=3.7 + conda install python=3.8 conda install numpy conda install -c conda-forge gpy conda install pytorch diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd From f03baec72bcd51d110de99949b2125a087832bf4 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Fri, 3 May 2024 10:47:47 -0400 Subject: [PATCH 27/28] more continuous integration fixes --- .github/workflows/continuous_integration.yml | 1 + tests/test_models.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 2ef37c7..1529d1d 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -66,6 +66,7 @@ jobs: python-version: ${{ matrix.cfg.python-version }} auto-activate-base: false miniforge-variant: Mambaforge + miniforge-version: latest use-mamba: true add-pip-as-python-dependency: true channels: conda-forge diff --git a/tests/test_models.py b/tests/test_models.py index 5f660ff..ff7aedb 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -32,7 +32,6 @@ def test_gp(): gp = peslearn.ml.GaussianProcess(datasets[i], input_obj, mol_strings[i]) gp.optimize_model() errors.append(gp.test_error) - print(gp.test_error) # Test set error < 50 cm-1 assert errors[0] < 50 assert errors[1] < 50 From 85edb8abfc16093f8601923c395de62c7660f921 Mon Sep 17 00:00:00 2001 From: iantbeck Date: Fri, 3 May 2024 10:50:55 -0400 Subject: [PATCH 28/28] stage docs --- docs/Makefile | 8 +- docs/README.md | 15 --- docs/requirements.yml | 2 +- docs/source/conf.py | 207 +++--------------------------------------- docs/source/index.rst | 29 +++++- 5 files changed, 40 insertions(+), 221 deletions(-) delete mode 100644 docs/README.md diff --git a/docs/Makefile b/docs/Makefile index 8883c11..d0c3cbf 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ 
-1,10 +1,10 @@ # Minimal makefile for Sphinx documentation # -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SPHINXPROJ = PESLearn +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = build diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index a090ac7..0000000 --- a/docs/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Compiling PESLearn's Documentation - -The docs for this project are built with Sphinx. To compile the docs, first ensure that Sphinx and the ReadTheDocs theme are installed. - -``` -conda install sphinx sphinx_rtd_theme -``` - -Once installed, you can use the Makefile in this directory to compile static HTML pages by - -``` -make html -``` - -The compiled docs will be in the _build directory and can be viewed by opening index.html (which may itself be inside a directory called html/ depending on what version of Sphinx is installed). diff --git a/docs/requirements.yml b/docs/requirements.yml index c688140..24f48e0 100644 --- a/docs/requirements.yml +++ b/docs/requirements.yml @@ -3,7 +3,7 @@ channels: - nodefaults - conda-forge dependencies: - - python=3 + - python=3.8 - sphinx - sphinx_rtd_theme - sphinx-automodapi diff --git a/docs/source/conf.py b/docs/source/conf.py index 87ada1a..e7aed5f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,213 +1,28 @@ -# -*- coding: utf-8 -*- -# # Configuration file for the Sphinx documentation builder. # -# This file does only contain a selection of the most common options. 
For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -import datetime -import os -import sys - -sys.path.insert(0, os.path.abspath('../..')) -import peslearn - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'PESLearn' -copyright = f'{datetime.datetime.today().year}' -author = 'The PESLearn Development Team' - -# The short X.Y version -version = peslearn.__version__ -# The full version, including alpha/beta/rc tags -release = peslearn.__version__ - +project = 'PES-Learn' +copyright = '2024, PES-Learn Development Team' +author = 'PES-Learn Development Team' +release = '1.0.0' # -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. 
-extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.doctest', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.viewcode', - 'sphinx.ext.extlinks', - 'sphinx.ext.graphviz', - 'sphinx.ext.autosummary', - 'sphinx.ext.napoleon', - 'sphinx_automodapi.automodapi', - 'sphinx_automodapi.automodsumm', - 'sphinx_automodapi.smart_resolver', - "sphinx_autodoc_typehints", - "sphinxcontrib.autodoc_pydantic", -] - -autosummary_generate = True -automodapi_toctreedirnm = 'api' -autodoc_typehints = "description" -napoleon_use_param = True -napoleon_use_rtype = True -autodoc_pydantic_model_hide_paramlist = True -autodoc_pydantic_model_show_config_summary = False -autodoc_pydantic_field_swap_name_and_alias = True +extensions = [] -# Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = "en" - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path . exclude_patterns = [] -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'default' # -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. 
-# -html_theme = 'sphinx_rtd_theme' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". +html_theme = 'alabaster' html_static_path = ['_static'] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = 'PESLearndoc' - - -# -- Options for LaTeX output ------------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'PESLearn.tex', 'PESLearn Documentation', - 'The PESLearn Development Team', 'manual'), -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). 
-man_pages = [ - (master_doc, 'peslearn', 'PESLearn Documentation', - [author], 1) -] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'PESLearn', 'PESLearn Documentation', - author, 'PESLearn', 'One line description of project.', - 'Miscellaneous'), -] - - -# -- Extension configuration ------------------------------------------------- - -extlinks = { - 'issue': ('https://github.com/CCQC/PES-Learn/issue/%s', 'GH#%s'), - 'pr': ('https://github.com/CCQC/PES-Learn/pull/%s', 'GH#%s') -} - -# -- Options for intersphinx extension --------------------------------------- - -# Example configuration for intersphinx: refer to the Python standard library. -""" -intersphinx_mapping = {'python': ('https://docs.python.org/3.10', None), - "numpy": ("https://numpy.org/doc/stable/", None), - 'scipy': ('https://docs.scipy.org/doc/scipy/', None), - 'matplotlib': ('https://matplotlib.org/stable/', None), - "qcelemental": ("http://docs.qcarchive.molssi.org/projects/QCElemental/en/latest/", None), - "qcportal": ("http://docs.qcarchive.molssi.org/projects/QCPortal/en/latest/", None), - "qcfractal": ("http://docs.qcarchive.molssi.org/projects/QCFractal/en/latest/", None), - } -""" -# -- Options for todo extension ---------------------------------------------- - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = True diff --git a/docs/source/index.rst b/docs/source/index.rst index 0b07264..37cb0de 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,9 +1,28 @@ -.. QCEngine documentation master file, created by - sphinx-quickstart on Fri Aug 17 09:45:43 2018. +.. PES-Learn documentation master file, created by + sphinx-quickstart on Thu May 2 11:10:40 2024. 
You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -========= -PESLearn -========= +Welcome to PES-Learn's documentation! +===================================== +**PES-Learn** is a Python library designed to fit system-specific Born-Oppenheimer +potential energy surfaces using modern machine learning models. PES-Learn assists in +generating datasets, and features Gaussian process, neural network, and kernel ridge regression +model optimization routines. The goal is to provide high-performance models for a given dataset +*without* requiring user expertise in machine learning. + +This project is under active development and welcomes community suggestions and contributions. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search`