Introduce multi-fidelity learning (#105)
* Use parallelism to get correct number of blocks

See Parsl/parsl#1647

* Compute the solvation energy too

Also save to gzipped files. The size is starting to be noticeable

* Implement a multi-fidelity MPNN

Uses delta learning to predict the properties at intermediate levels

* No longer test for errors being thrown

* Only output the highest level of fidelity

Also some flake8 fixes

* Document how multi-fidelity training works

* Minor changes to the documentation

* Use a more robust relaxation technique (#108)

* Use MDMin to reduce to 0.1 eV/Ang, then BFGS

Still want to test this before we merge to main,
but fixes #106

* Use FIRE and a higher threshold for switching

* Use molecule which takes longer to optimize in test

* Use isnan and not isinf for detecting placeholders

* Switch to one scale layer per network

* Compute diff between adjacent levels, not from the first

* Also fix how we compute inference

Delta between adjacent levels, not the beginning

Changed our test routine to ensure we do this right

* Initial training runs for multi-fidelity learning

* Update data loader test to handle new fixtures

* Train using subset of available data, test on all fidelities

* Fix minor bug: LR decay started immediately
WardLT authored Sep 21, 2023
1 parent 8f98d6b commit cba0d06
Showing 15 changed files with 1,180 additions and 81 deletions.
19 changes: 19 additions & 0 deletions docs/components/score.rst
@@ -66,3 +66,22 @@ copy of the model:
    update_msg = scorer.retrain(model_msg, inputs, outputs)  # Can be run remotely
    model = scorer.update(model, update_msg)

Multi-fidelity Learning
-----------------------

Some Scorer classes support using properties computed at lower levels of accuracy
to improve performance.
The strategies employed by each Scorer may be different, but all have the same interface.

Use the multi-fidelity capability of a Scorer by providing multiple recipes when preprocessing
*both* the inputs and the outputs for training or inference.
The recipes must be ordered from lowest- to highest-fidelity.

.. code-block:: python

    outputs = model.transform_outputs(records, [recipe_low, recipe_high])
    inputs = model.transform_inputs(records, [recipe_low, recipe_high])

By default, the outputs will contain the property value computed at each level of fidelity,
with ``np.nan`` in place of missing data.
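For illustration, a minimal sketch (with invented values) of the label matrix these calls produce for two fidelity levels:

.. code-block:: python

    import numpy as np

    # Hypothetical labels for three molecules at two fidelity levels;
    # np.nan marks a property that has not been computed yet
    outputs = np.array([
        [-1.02, -1.10],
        [-0.87, np.nan],  # high-fidelity value still missing
        [-0.55, -0.61],
    ])
    assert outputs.shape == (3, 2)  # rows: molecules, columns: recipes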
3 changes: 2 additions & 1 deletion docs/installation.rst
@@ -44,7 +44,8 @@ designed to run on OS X. It will not run all features (e.g., xTB computations and
but is enough to test many features

.. code-block:: shell

-    conda env create --file envs/environment-cpu.yaml
+    conda env create --file envs/environment-macos.yaml
Modifying an Installation
-------------------------
3 changes: 2 additions & 1 deletion examol/reporting/database.py
@@ -11,7 +11,8 @@ class DatabaseWriter(BaseReporter):
"""Writes the current database to disk as a JSON file"""

     def report(self, thinker: MoleculeThinker):
+        temp_database = thinker.database.copy()
         with open(thinker.run_dir / 'database.json', 'w') as fp:
-            for record in thinker.database.values():
+            for record in temp_database.values():
                 print(record.to_json(), file=fp)
         logger.info(f'Saved {len(thinker.database)} records to disk')
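The copy appears to guard against the database being modified while it is being written. A minimal sketch of the failure mode this avoids (illustrative Python, not ExaMol code):

.. code-block:: python

    # Mutating a dict while iterating over it raises an error
    database = {'mol-1': 'record-1'}
    iterator = iter(database.values())
    database['mol-2'] = 'record-2'  # simulates a record arriving mid-write
    try:
        next(iterator)
    except RuntimeError as exc:
        print(exc)  # dictionary changed size during iteration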
58 changes: 50 additions & 8 deletions examol/score/base.py
@@ -1,12 +1,28 @@
"""Base classes for scoring functions"""
from dataclasses import dataclass
from typing import Sequence

import numpy as np

from examol.store.models import MoleculeRecord
from examol.store.recipes import PropertyRecipe


def collect_outputs(records: list[MoleculeRecord], recipes: list[PropertyRecipe]) -> np.ndarray:
"""Collect the outputs for several recipe for each molecule
Args:
records: Molecule records to be summarized
recipes: List of recipes to include
Returns:
Matrix where each row is a different molecule, and each column is a different recipe
"""
return np.array([
[record.properties.get(recipe.name, {}).get(recipe.level, np.nan) for recipe in recipes]
for record in records
])
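A self-contained sketch of the same lookup using plain dicts in place of ``MoleculeRecord`` objects (property and level names invented):

.. code-block:: python

    import numpy as np

    # Each record stores properties as {name: {level: value}}
    properties_by_record = [
        {'redox_potential': {'xtb': -0.95, 'dft': -1.05}},
        {'redox_potential': {'xtb': -0.80}},  # 'dft' level not computed yet
    ]
    recipes = [('redox_potential', 'xtb'), ('redox_potential', 'dft')]
    matrix = np.array([
        [props.get(name, {}).get(level, np.nan) for name, level in recipes]
        for props in properties_by_record
    ])
    print(matrix)  # [[-0.95 -1.05], [-0.8 nan]]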


@dataclass
class Scorer:
"""Base class for algorithms which quickly assign a score to a molecule, typically using a machine learning model
@@ -40,32 +56,58 @@ class Scorer:
outputs = scorer.transform_outputs(records, recipe) # Prepares label for a specific recipe
update_msg = scorer.retrain(model_msg, inputs, outputs) # Run remotely
model = scorer.update(model, update_msg)
**Multi-fidelity scoring**
Multi-fidelity learning methods employ lower-fidelity estimates of a target value to improve the prediction of that value.
ExaMol supports multi-fidelity through the ability to provide more than one recipe as inputs to
:meth:`transform_inputs` and :meth:`transform_outputs`.
Not all Scorers support multi-fidelity learning; an implementation must be specifically designed to support it.
"""

-    def transform_inputs(self, record_batch: list[MoleculeRecord]) -> list:
+    _supports_multi_fidelity: bool = False
+    """Whether the class supports multi-fidelity optimization"""
+
+    def transform_inputs(self, record_batch: list[MoleculeRecord], recipes: Sequence[PropertyRecipe] | None = None) -> list:
"""Form inputs for the model based on the data in a molecule record
Args:
record_batch: List of records to pre-process
recipes: List of recipes ordered from lowest to highest fidelity.
Only used in multi-fidelity scoring algorithms
Returns:
List of inputs ready for :meth:`score` or :meth:`retrain`
"""
raise NotImplementedError()

-    def transform_outputs(self, records: list[MoleculeRecord], recipe: PropertyRecipe) -> np.ndarray:
+    # TODO (wardlt): I'm not super-happy with multi-fidelity being inferred from input types. What if we want multi-objective learning
+    def transform_outputs(self, records: list[MoleculeRecord], recipe: PropertyRecipe | Sequence[PropertyRecipe]) -> np.ndarray:
"""Gather the target outputs of the model
Args:
records: List of records from which to extract outputs
-            recipe: Target recipe for the scorer
+            recipe: Target recipe for the scorer for single-fidelity learning,
+                or a list of recipes ordered from lowest to highest fidelity
+                for multi-fidelity learning.
Returns:
Outputs ready for model training
"""
-        for record in records:
-            if recipe.name not in record.properties or recipe.level not in record.properties[recipe.name]:
-                raise ValueError(f'Record for {record.identifier.smiles} missing property {recipe.name} at level {recipe.level}')
-
-        return np.array([x.properties[recipe.name][recipe.level] for x in records])
+        # Determine if we are doing single or multi-fidelity learning
+        is_single = False
+        if isinstance(recipe, PropertyRecipe):
+            is_single = True
+            recipes = [recipe]
+        else:
+            if not self._supports_multi_fidelity:  # pragma: no-coverage
+                raise ValueError(f'{self.__class__.__name__} does not support multi-fidelity training')
+            recipes = recipe
+
+        # Gather the outputs
+        output = collect_outputs(records, recipes)
+        if is_single:
+            return output[:, 0]
+        return output

def prepare_message(self, model: object, training: bool = False) -> object:
"""Get the model state as a serializable object
141 changes: 115 additions & 26 deletions examol/score/nfp.py
@@ -1,6 +1,7 @@
"""Train neural network models using `NFP <https://github.com/NREL/nfp>`_"""

from sklearn.model_selection import train_test_split

try:
from tensorflow.keras import callbacks as cb
except ImportError as e: # pragma: no-coverage
@@ -11,7 +12,8 @@

from examol.store.models import MoleculeRecord
from examol.utils.conversions import convert_string_to_nx
from .base import Scorer
from examol.store.recipes import PropertyRecipe
from .base import Scorer, collect_outputs
from .utils.tf import LRLogger, TimeLimitCallback, EpochTimeLogger


@@ -53,16 +55,26 @@ def make_simple_network(
output_layers: list[int] = (512, 256, 128),
reduce_op: str = 'mean',
atomwise: bool = True,
outputs: int = 1,
) -> tf.keras.models.Model:
"""Construct a Keras model using the settings provided by a user
"""Construct a basic MPNN model using the settings provided by a user
Models will have embeddings for atoms with atomic numbers up to 63,
and 4 types of bonds (single, double, triple, aromatic).
The models use edge, node, and global update for each message passing layer
and a separate set of MLPs for each of the outputs.
There is also a "scaling" layer which can be used to adjust the mean
and standard deviation of the prediction.
Args:
atom_features: Number of features used per atom and bond
message_steps: Number of message passing steps
output_layers: Number of neurons in the readout layers
reduce_op: Operation used to reduce from atom-level to molecule-level vectors
-        atomwise: Whether to reduce atomwise contributions to form an output,
+        atomwise: Whether to reduce atomwise contributions after the output layers,
             or reduce to a single vector per molecule before the output layers
+        outputs: Number of output properties. Each will use its own output network
Returns:
A model instantiated with the user-defined options
"""
@@ -94,16 +106,34 @@ def make_simple_network(
)
global_state = tf.keras.layers.Add()([global_state, new_global_state])

-    # Pass the global state through an output
-    output = atom_state
+    # Condense the features into a single vector if building a molecular fingerprint
     if not atomwise:
-        output = ReduceAtoms(reduce_op)(output)
-    for shape in output_layers:
-        output = tf.keras.layers.Dense(shape, activation='relu')(output)
-    output = tf.keras.layers.Dense(1)(output)
-    if atomwise:
-        output = ReduceAtoms(reduce_op)(output)
-    output = tf.keras.layers.Dense(1, activation='linear', name='scale')(output)
+        start_state = ReduceAtoms(reduce_op)(atom_state)
+    else:
+        start_state = atom_state

# Build the output layers
output_networks = []
for output_id in range(outputs):
# Build the MLP
output = start_state
for i, shape in enumerate(output_layers):
output = tf.keras.layers.Dense(shape, activation='relu', name=f'output_{output_id}_layer_{i}')(output)
output = tf.keras.layers.Dense(1)(output)

# Reduce to a single prediction per network if needed
if atomwise:
output = ReduceAtoms(reduce_op)(output)

# Apply a scale layer then append to the outputs
output = tf.keras.layers.Dense(1, activation='linear', name=f'scale_{output_id}')(output)
output_networks.append(output)

# Combine them if needed
if len(output_networks) == 1:
output = output_networks[0]
else:
output = tf.keras.layers.Concatenate(axis=-1)(output_networks)

# Construct the tf.keras model
return tf.keras.Model([atom, bond, connectivity], [output])
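A hedged usage sketch of the multi-output option (assuming the arguments not shown above keep their defaults): a two-fidelity model gets two output heads, named ``scale_0`` and ``scale_1``, whose scalar predictions are concatenated.

.. code-block:: python

    from examol.score.nfp import make_simple_network

    # One head for the low-fidelity value, one for the delta to the next level;
    # a batch of molecules yields predictions of shape (batch, 2)
    model = make_simple_network(outputs=2)
    model.summary()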
@@ -261,7 +291,17 @@ def generator():
class NFPScorer(Scorer):
"""Train message-passing neural networks based on the `NFP <https://github.com/NREL/nfp>`_ library.
-    NFP uses Keras to define message-passing networks, which is backed by Tensorflow for executing the networks on different hardware."""
+    NFP uses Keras to define message-passing networks, which is backed by Tensorflow for executing the networks on different hardware.
Multi-fidelity models predict the lowest, most-plentiful level of fidelity directly and
correction factors to adjust the low-level predictions for the higher levels (i.e., delta learning).
Training does not require all levels of fidelity to be available and will only measure loss
against the available data.
Inference predicts the low-fidelity value and all correction factors for higher levels,
but uses known values in place of them if available.
"""

_supports_multi_fidelity = True

def __init__(self, retrain_from_scratch: bool = True):
"""
@@ -276,10 +316,17 @@ def prepare_message(self, model: tf.keras.models.Model, training: bool = False)
else:
return NFPMessage(model)

-    def transform_inputs(self, record_batch: list[MoleculeRecord]) -> list:
-        return [convert_string_to_dict(record.identifier.inchi) for record in record_batch]
+    def transform_inputs(self, record_batch: list[MoleculeRecord], recipes: list[PropertyRecipe] | None = None) -> list[dict | tuple[dict, np.ndarray]]:
+        mol_dicts = [convert_string_to_dict(record.identifier.inchi) for record in record_batch]

# Return only the molecular dicts for single-fidelity runs
if recipes is None:
return mol_dicts

-    def score(self, model_msg: NFPMessage, inputs: list[dict], batch_size: int = 64, **kwargs) -> np.ndarray:
+        # Return both the molecular dictionary and known properties for multi-fidelity
+        return list(zip(mol_dicts, collect_outputs(record_batch, recipes)))
+
+    def score(self, model_msg: NFPMessage, inputs: list[dict | tuple[dict, np.ndarray]], batch_size: int = 64, **kwargs) -> np.ndarray:
"""Assign a score to molecules
Args:
@@ -290,8 +337,25 @@ def score(self, model_msg: NFPMessage, inputs: list[dict], batch_size: int = 64,
The scores to a set of records
"""
model = model_msg.get_model() # Unpack the model

# Unpack the known values if running multi-fidelity learning
is_single = isinstance(inputs[0], dict)
known_outputs = None
if not is_single:
inputs, known_outputs = zip(*inputs)
known_outputs = np.array(known_outputs)
known_outputs[:, 1:] = np.diff(known_outputs)

# Run inference
loader = make_data_loader(inputs, batch_size=batch_size)
-        return model.predict(loader, verbose=False)
+        ml_outputs = model.predict(loader, verbose=False)
if is_single:
return ml_outputs

# For multi-fidelity, use the known outputs in place of the NN outputs where available
best_outputs = np.where(np.isnan(known_outputs), ml_outputs, known_outputs)
best_outputs = best_outputs.cumsum(axis=1) # The outputs of the networks are deltas
return best_outputs[:, -1] # Return only the highest level of fidelity
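A self-contained sketch of the delta-learning arithmetic above, with invented values for one molecule and two fidelity levels:

.. code-block:: python

    import numpy as np

    # The network predicts the low-fidelity value and the delta to the next level
    ml_outputs = np.array([[-1.00, -0.10]])
    # The low-fidelity value is already known; the high-fidelity one is not
    known_outputs = np.array([[-0.95, np.nan]])
    known_outputs[:, 1:] = np.diff(known_outputs)  # levels -> deltas (NaN stays NaN)

    best = np.where(np.isnan(known_outputs), ml_outputs, known_outputs)  # [[-0.95, -0.10]]
    best = best.cumsum(axis=1)  # undo the deltas: [[-0.95, -1.05]]
    print(best[:, -1])  # [-1.05], the highest-fidelity estimate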

def retrain(self,
model_msg: dict | NFPMessage,
@@ -330,24 +394,48 @@ def retrain(self,
model = model_msg.get_model()
elif isinstance(model_msg, dict):
model = tf.keras.Model.from_config(model_msg, custom_objects=custom_objects)
-        else:
+        else:  # pragma: no-coverage
raise NotImplementedError(f'Unrecognized message type: {type(model_msg)}')

# Prepare data for single- vs multi-objective
is_single = isinstance(inputs[0], dict)
if is_single:
# Nothing special: Use a standard loss function, no preprocessing required
loss = 'mean_squared_error'
value_spec = tf.TensorSpec((), dtype=tf.float32)
else:
# Use a loss function which ignores the NaN values
def loss(y_true, y_pred):
"""Measure loss only on the non NaN values"""
is_known = tf.math.is_finite(y_true)
return tf.keras.losses.mean_squared_error(y_true[is_known], y_pred[is_known])

inputs, _ = zip(*inputs)  # Drop the known property values; training labels come from the `outputs` argument

# Prepare the outputs
outputs = outputs.copy()
outputs[:, 1:] = np.diff(outputs) # Compute the deltas between successive stages
value_spec = tf.TensorSpec((outputs.shape[1],), dtype=tf.float32)

# Split off a validation set
train_x, valid_x, train_y, valid_y = train_test_split(inputs, outputs, test_size=validation_split)

# Make the loaders
steps_per_epoch = len(train_x) // batch_size
-        train_loader = make_data_loader(train_x, train_y, repeat=True, batch_size=batch_size, drop_last_batch=True, shuffle_buffer=32768)
+        train_loader = make_data_loader(train_x, train_y, repeat=True, batch_size=batch_size, drop_last_batch=True, shuffle_buffer=32768, value_spec=value_spec)
valid_steps = len(valid_x) // batch_size
-        assert valid_steps > 0, 'We need some validation data'
-        valid_loader = make_data_loader(valid_x, valid_y, batch_size=batch_size, drop_last_batch=True)
+        if valid_steps == 0:  # pragma: no-coverage
+            raise ValueError(f'Insufficient validation data. Need at least {batch_size} records')
+        valid_loader = make_data_loader(valid_x, valid_y, batch_size=batch_size, drop_last_batch=True, value_spec=value_spec)

# Define initial guesses for the "scaling" layer
try:
-            scale_layer = model.get_layer('scale')
-            outputs = np.array(outputs)
-            scale_layer.set_weights([outputs.std()[None, None], outputs.mean()[None]])
+            output_mean = np.nanmean(outputs, axis=0)
+            outputs_std = np.clip(np.nanstd(outputs, axis=0), 1e-6, a_max=None)
+            for i, (m, s) in enumerate(zip(np.atleast_1d(output_mean), np.atleast_1d(outputs_std))):
+                scale_layer = model.get_layer(f'scale_{i}')
+                scale_layer.set_weights([np.atleast_2d(s), np.atleast_1d(m)])
except ValueError:
pass
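A minimal sketch (invented data) of the per-output statistics computed above, showing how missing labels are ignored and a zero standard deviation is avoided:

.. code-block:: python

    import numpy as np

    outputs = np.array([[-1.0, -1.1],
                        [-0.9, np.nan],  # one high-fidelity label missing
                        [-0.5, -0.6]])
    output_mean = np.nanmean(outputs, axis=0)  # per-column mean, NaNs ignored
    outputs_std = np.clip(np.nanstd(outputs, axis=0), 1e-6, None)  # floor avoids a zero scale
    # Each Dense layer named 'scale_{i}' is seeded with weight [[s]] and bias [m]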

@@ -357,13 +445,14 @@ def retrain(self,
decay_rate = (final_learn_rate / init_learn_rate) ** (1. / (num_epochs - 1))

def lr_schedule(epoch, lr):
-            return lr * decay_rate
+            if epoch > 0:
+                return lr * decay_rate
+            return lr
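A self-contained sketch of this schedule (rates and epoch count invented): the first epoch keeps the initial rate, and the remaining ``num_epochs - 1`` decays land exactly on the final rate.

.. code-block:: python

    init_learn_rate, final_learn_rate, num_epochs = 1e-3, 1e-4, 10
    decay_rate = (final_learn_rate / init_learn_rate) ** (1. / (num_epochs - 1))

    lr = init_learn_rate
    for epoch in range(num_epochs):
        if epoch > 0:
            lr *= decay_rate  # applied num_epochs - 1 times in total
    print(lr)  # ~1e-4, the final learning rate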

# Compile the model then train
model.compile(
tf.optimizers.Adam(init_learn_rate),
-            'mean_squared_error',
-            metrics=['mean_absolute_error'],
+            loss=loss,
steps_per_execution=steps_per_exec,
)
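A quick, self-contained check of the NaN-masking loss defined in ``retrain`` above (values invented):

.. code-block:: python

    import numpy as np
    import tensorflow as tf

    y_true = tf.constant([1.0, np.nan, 3.0])
    y_pred = tf.constant([1.5, 2.0, 2.0])
    is_known = tf.math.is_finite(y_true)
    loss_val = tf.keras.losses.mean_squared_error(y_true[is_known], y_pred[is_known])
    print(float(loss_val))  # 0.625 == mean(0.5**2, 1.0**2)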

