diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml index 504eb1c..443780f 100644 --- a/.github/workflows/gh-pages.yml +++ b/.github/workflows/gh-pages.yml @@ -14,6 +14,7 @@ jobs: - uses: actions/setup-python@v3 - name: Install dependencies run: | + pip install -e . pip install -r docs/requirements.txt - name: Sphinx build run: | diff --git a/README.md b/README.md index bef4c42..160e63a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,12 @@ # ExaMol [![CI](https://github.com/exalearn/ExaMol/actions/workflows/python-app.yml/badge.svg)](https://github.com/exalearn/ExaMol/actions/workflows/python-app.yml) +[![Deploy Docs](https://github.com/exalearn/ExaMol/actions/workflows/gh-pages.yml/badge.svg)](https://exalearn.github.io/ExaMol/) [![Coverage Status](https://coveralls.io/repos/github/exalearn/ExaMol/badge.svg?branch=main)](https://coveralls.io/github/exalearn/ExaMol?branch=main) -Designing new molecules as fast as possible with AI and simulation +Designing new molecules as fast as possible with AI and simulation. + +- Documentation: [exalearn.github.io/ExaMol/](https://exalearn.github.io/ExaMol/) +- Source Code: [github.com/exalearn/ExaMol](https://github.com/exalearn/ExaMol) ## Installation diff --git a/docs/api/examol.rst b/docs/api/examol.rst new file mode 100644 index 0000000..0b2c943 --- /dev/null +++ b/docs/api/examol.rst @@ -0,0 +1,13 @@ +API Documentation +================= + +.. toctree:: + :maxdepth: 2 + + examol.score + examol.select + examol.simulate + examol.specify + examol.steer + examol.store + examol.utils diff --git a/docs/api/examol.score.rst b/docs/api/examol.score.rst new file mode 100644 index 0000000..4798153 --- /dev/null +++ b/docs/api/examol.score.rst @@ -0,0 +1,30 @@ +examol.score +============ + +.. automodule:: examol.score + :members: + :show-inheritance: + +examol.score.base +----------------- + +.. 
automodule:: examol.score.base + :members: + :undoc-members: + :show-inheritance: + +examol.score.nfp +---------------- + +.. automodule:: examol.score.nfp + :members: + :undoc-members: + :show-inheritance: + +examol.score.rdkit +------------------ + +.. automodule:: examol.score.rdkit + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/examol.select.rst b/docs/api/examol.select.rst new file mode 100644 index 0000000..92bb2ed --- /dev/null +++ b/docs/api/examol.select.rst @@ -0,0 +1,22 @@ +examol.select +============= + +.. automodule:: examol.select + :members: + :show-inheritance: + +examol.select.base +------------------ + +.. automodule:: examol.select.base + :members: + :undoc-members: + :show-inheritance: + +examol.select.baseline +---------------------- + +.. automodule:: examol.select.baseline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/examol.simulate.rst b/docs/api/examol.simulate.rst new file mode 100644 index 0000000..b66f166 --- /dev/null +++ b/docs/api/examol.simulate.rst @@ -0,0 +1,42 @@ +examol.simulate +=============== + +.. automodule:: examol.simulate + :members: + :show-inheritance: + +examol.simulate.base +-------------------- + +.. automodule:: examol.simulate.base + :members: + :undoc-members: + :show-inheritance: + +examol.simulate.initialize +-------------------------- + +.. automodule:: examol.simulate.initialize + :members: + :undoc-members: + :show-inheritance: + +examol.simulate.ase +------------------- + +.. automodule:: examol.simulate.ase + :members: + :undoc-members: + :show-inheritance: + + +examol.simulate.ase.utils +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: examol.simulate.ase.utils + :members: + :undoc-members: + :show-inheritance: + + + diff --git a/docs/api/examol.specify.rst b/docs/api/examol.specify.rst new file mode 100644 index 0000000..e51f99b --- /dev/null +++ b/docs/api/examol.specify.rst @@ -0,0 +1,7 @@ +examol.specify +============== + +.. 
automodule:: examol.specify + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/examol.steer.rst b/docs/api/examol.steer.rst new file mode 100644 index 0000000..b00b325 --- /dev/null +++ b/docs/api/examol.steer.rst @@ -0,0 +1,22 @@ +examol.steer +============ + +.. automodule:: examol.steer + :members: + :show-inheritance: + +examol.steer.base +----------------- + +.. automodule:: examol.steer.base + :members: + :undoc-members: + :show-inheritance: + +examol.steer.single +------------------- + +.. automodule:: examol.steer.single + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/examol.store.rst b/docs/api/examol.store.rst new file mode 100644 index 0000000..b7b8932 --- /dev/null +++ b/docs/api/examol.store.rst @@ -0,0 +1,21 @@ +examol.store +============ + +.. automodule:: examol.store + :members: + :show-inheritance: + +examol.store.models +------------------- + +.. automodule:: examol.store.models + :members: + :undoc-members: + :show-inheritance: + +examol.store.recipes +-------------------- + +.. automodule:: examol.store.recipes + :members: + :show-inheritance: diff --git a/docs/api/examol.utils.rst b/docs/api/examol.utils.rst new file mode 100644 index 0000000..682a3e6 --- /dev/null +++ b/docs/api/examol.utils.rst @@ -0,0 +1,22 @@ +examol.utils +============ + +.. automodule:: examol.utils + :members: + :show-inheritance: + +examol.utils.chemistry +---------------------- + +.. automodule:: examol.utils.chemistry + :members: + :undoc-members: + :show-inheritance: + +examol.utils.conversions +------------------------ + +.. automodule:: examol.utils.conversions + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/components/index.rst b/docs/components/index.rst new file mode 100644 index 0000000..e122acc --- /dev/null +++ b/docs/components/index.rst @@ -0,0 +1,13 @@ +Components +========== + +The ExaMol library is built around components each dedicated to different aspects of a design application. 
+ +.. toctree:: + :maxdepth: 1 + + score + select + simulate + steer + store \ No newline at end of file diff --git a/docs/components/score.rst b/docs/components/score.rst new file mode 100644 index 0000000..4011305 --- /dev/null +++ b/docs/components/score.rst @@ -0,0 +1,65 @@ +Score +===== + +The Score module defines interfaces for running machine learning (ML) tasks on distributed systems. +Each implementation of :class:`~examol.score.base.Scorer` provides tools for sending models to +remote compute nodes, +preparing molecular data for training or inference, +and functions for executing training and inference on remote nodes. + +Available Interfaces +-------------------- + +ExaMol provides interfaces to several libraries which support ML on molecular property data. + +.. list-table:: + :header-rows: 1 + + * - Interface + - Model Types + - Description + * - :class:`~examol.score.rdkit.RDKitScorer` + - Conventional ML + - Models which use fingerprints computed from RDKit as inputs to scikit-learn Pipelines. + * - :class:`~examol.score.nfp.NFPScorer` + - MPNNs + - Neural networks based on the `Neural Fingerprints (nfp) library `_, + which is backed by Tensorflow + +Using Scorers +------------- + +Scorers separate pre-processing data, transmitting models, and running ML tasks into separate steps +so that they can be distributed across supercomputing resources. + +Consider model training as an example. +Start by creating a scorer, a model it will train, and the recipe describing the computations to be learned. + +.. code-block:: python + + scorer = RDKitScorer() + recipe = RedoxEnergy(charge=1, config_name='xtb') + model = make_knn_model() + +Training the model requires first transforming the available molecule data +(as `molecule data records `_) +into inputs and outputs compatible with the scorer. + +.. 
code-block:: python + + outputs = scorer.transform_outputs(records, recipe) # Outputs are specific to a recipe + inputs = scorer.transform_inputs(records) # Inputs are not + +Then, convert the model into a form that can be transmitted across nodes + +.. code-block:: python + + model_msg = scorer.prepare_message(model, training=True) + +ExaMol is now ready to run training on a remote node, and will use the output of training to update the local +copy of the model: + +.. code-block:: python + + update_msg = scorer.retrain(model_msg, inputs, outputs) # Can be run remotely + model = scorer.update(model, update_msg) diff --git a/docs/components/select.rst b/docs/components/select.rst new file mode 100644 index 0000000..2c7f418 --- /dev/null +++ b/docs/components/select.rst @@ -0,0 +1,55 @@ +Select +====== + +The Select module defines adaptive experimental design algorithms +that select the next computations based on predictions from the +`machine learning models `_. + +Available Selectors +------------------- + +ExaMol includes selectors that have a variety of characteristics + +.. list-table:: + :header-rows: 1 + + * - Selector + - Category + - Batch Aware + - Multi-Fidelity + * - :class:`~examol.select.baseline.RandomSelector` + - Baseline + - ✘ + - ✘ + * - :class:`~examol.select.baseline.GreedySelector` + - Baseline + - ✘ + - ✘ + +Using a Selector +---------------- + +Selectors employ a batching strategy to work with very large search spaces. + +Start the selection process by creating the Selector then signaling that it should prepare to receive batches. + +.. code-block:: python + + selector = GreedySelector(to_select=2, maximize=True) + selector.start_gathering() + +The Selector can then receive new predictions as a list of "keys" that identify the computations +associated with a list of predictions from a machine learning model. + + +.. 
code-block:: python + + selector.add_possibilities(keys=[1, 2, 3], samples=np.array([[1, 2, 3]]).T) + +Retrieve the list of selected computations by stopping the gathering mode then generating them +from the "dispense" function. + +.. code-block:: python + + selector.start_dispensing() + print(list(selector.dispense())) # [(3, 3.), (2, 2.)] \ No newline at end of file diff --git a/docs/components/simulate.rst b/docs/components/simulate.rst new file mode 100644 index 0000000..a07de61 --- /dev/null +++ b/docs/components/simulate.rst @@ -0,0 +1,93 @@ +Simulate +======== + +The Simulate module defines restricted interfaces to computational chemistry codes. + + +Accuracy Levels +--------------- + +The core concept behind simulation in ExaMol is that there are only a few levels of accuracy. +The idea is to prioritize consistency of settings across computations +over flexibility in being able to run slightly-different computations in a workflow. + +Each level maps to a different computational chemistry code using a specific set of parameters +that are validated for the `recipes `_ available through ExaMol. + +.. |ASESimulator| replace:: :class:`~examol.simulate.ase.ASESimulator` +.. list-table:: + :header-rows: 1 + + * - Name + - Interface + - Code + - Description + * - xtb + - |ASESimulator| + - xTB + - Tight binding using the GFN2-xTB parameterization + * - cp2k_blyp_szv + - |ASESimulator| + - CP2K + - Gaussian-Augmented Plane Wave DFT with a BLYP XC functional and the SZV-GTH basis set + * - cp2k_blyp_dzvp + - |ASESimulator| + - CP2K + - Gaussian-Augmented Plane Wave DFT with a BLYP XC functional and the DZVP-GTH basis set + * - cp2k_blyp_tzvp + - |ASESimulator| + - CP2K + - Gaussian-Augmented Plane Wave DFT with a BLYP XC functional and the TZVP-GTH basis set + + +After selecting a level of accuracy, select the interface needed to run it. 
+ + +The Simulator Interface +----------------------- + +ExaMol provides workflow-compatible interfaces for common operations in quantum chemistry +through the :class:`~examol.simulate.base.BaseSimulator` interface. +Each Simulator implementation provides functions to compute the energy of a structure +and a function to perform a geometry optimization which take inputs and produce outputs +suitable for transmitting between computers. + +Create a simulator interface by providing it first with any options needed to run on +your specific supercomputer. +An interface that will use CP2K could, for example, require a path to the scratch directory +and the mpiexec command used to launch it. + +.. code-block:: python + + sim = ASESimulator( + scratch_dir='cp2k-files', + cp2k_command=f'mpiexec -n 8 --ppn 4 --cpu-bind depth --depth 8 -env OMP_NUM_THREADS=8 ' + '/path/to/exe/local_cuda/cp2k_shell.psmp', + ) + +The interface can then run the energy computations or optimizations with CP2K. +Each computation returns a :class:`~examol.simulate.base.SimResult` object containing the +energy and structure of the outputs. + +.. code-block:: python + + out_res, traj_res, _ = sim.optimize_structure( + xyz=xyz, + config_name='cp2k_blyp_dzvp', + charge=0 + ) + solv_res, _ = sim.compute_energy( + xyz=out_res.xyz, + config_name='cp2k_blyp_dzvp', + charge=1, + solvent='acn' + ) + +.. _levels: + +Adding New Accuracy Levels +-------------------------- + +.. note:: + + Work in Progress. Logan is working to make this easier (see `Issue #40 `_) diff --git a/docs/components/steer.rst b/docs/components/steer.rst new file mode 100644 index 0000000..05f278d --- /dev/null +++ b/docs/components/steer.rst @@ -0,0 +1,57 @@ +Steer +===== + +ExaMol scales to use large supercomputers by managing many tasks together. +The logic for when to launch tasks and how to process completed tasks are defined +as `Colmena `_ "Thinker" classes. 
+ExaMol will contain several different Thinkers, which each use different strategies +for deploying tasks on a supercomputer. + +Single Objective Thinker as an Example +-------------------------------------- + +The :class:`~examol.steer.single.SingleObjectiveThinker` is a good example for explaining how Thinkers work in ExaMol. + +The strategy for this thinker has three parts: + +#. Never leave nodes on the supercomputer idle +#. Update the list of selected calculations with new data as quickly as possible +#. Wait until resources are free before submitting the next calculation. + +This strategy is achieved by writing out a series of simple policies, such as: + +- Submit a new quantum chemistry calculation when another completes +- Begin re-training models as soon as a recipe is complete for any molecule +- Re-run inference for all molecules as soon as all models finish training + +These policy steps are defined as methods of the Thinker marked with a special decorator +(see `Colmena's quickstart `_). +For example, the "submit a new quantum chemistry calculation" policy is defined by a pair of methods + +.. code-block:: python + + class SingleObjectiveThinker(MoleculeThinker): + ... + @result_processor(topic='simulation') + def store_simulation(self, result: Result): + """Store the output of a simulation""" + # Trigger a new simulation to start + self.rec.release() + ... + + @task_submitter() + def submit_simulation(self): + """Submit a simulation task when resources are available""" + record, suggestion = next(self.task_iterator) # Returns a molecule record and the suggested computation + ... + + + +``store_simulation`` runs when a simulation result completes +and starts by marking resources available before updating the database +and - if conditions are right - retraining the models. +``submit_simulation`` is started as soon as resources are marked as free, +keeping the supercomputer occupied. 
+ +The other methods manage keeping machine learning models up-to-date and +ensuring the task iterator (``self.task_iterator``) produces the best possible computations to run. diff --git a/docs/components/store.rst b/docs/components/store.rst new file mode 100644 index 0000000..d714950 --- /dev/null +++ b/docs/components/store.rst @@ -0,0 +1,95 @@ +Store +===== + +The Store module handles capturing data about molecules and using collected data to compute derived properties. + +Data Models +----------- + +The :class:`~examol.store.models.MoleculeRecord` captures information on a molecule.\ [1]_ +Information includes identifiers (e.g., SMILES string, project-specific names), +organizational metadata (e.g., membership in project-specific subsets), +energies of the molecule in different geometries (i.e., conformers), +and properties derived from the energies. + +Energies are stored as a list of :class:`~examol.store.models.Conformer` objects. +Each Conformer object is a different geometry\ [2]_ and we store the energies under different conditions +(e.g., computational methods, charge states) as :class:`~examol.store.models.EnergyEvaluation` objects. + +Create a record and populate information about it by +creating a blank Record from a molecule identifier (i.e., SMILES) +then providing a simulation result to its ``add_energies`` method. + +.. code-block:: python + + record = MoleculeRecord.from_identifier('C') + sim_result = SimResult( + xyz='5\nmethane\n0.0000...', + charge=0, + energy=-1, + config_name='test', + solvent=None + ) # an example result (normally not created manually) + record.add_energies(sim_result) + +You can then look up the stored energies for a molecule from the record. +For example, ExaMol provides a utility operation for finding the lowest-energy conformer: + +.. 
code-block:: python + + conf, energy = record.find_lowest_conformer(config_name='test', charge=0, solvent=None) + assert isclose(energy, -1) + assert conf.xyz.startswith('5\nmethane\n0.0000') + +Technical Details +~~~~~~~~~~~~~~~~~ + +The data models are implemented as MongoEngine :class:`~mongoengine.Document` objects +so that they are easy to store in MongoDB, convert to JSON objects, etc. + +Recipes +------- + +Recipes define how to compute a property of a molecule from multiple energy computations. +All are based on the :class:`~examol.store.recipes.PropertyRecipe` object, and provide a +function to compute the property from a molecule data record +and a second to generate the list of computations required to complete the recipe. + +Use an existing recipe by specifying details on the property (e.g., which solvent?) and +the target level of accuracy. +Consult the `API docs <../api/examol.store.html#module-examol.store.recipes>`_ for properties available in ExaMol. + +The recipe will then create an informative name for the property and a level of accuracy: + +.. code-block:: python + + recipe = RedoxEnergy(charge=1, config_name='test', solvent='acn', vertical=False) + print(recipe.name) # reduction_potential + print(recipe.level) # test_acn_vertical + + +You can then use the recipe to determine what is left to do for a molecule + +.. code-block:: python + + to_do = recipe.suggest_computations(record) + +or compute the property then store it in a data record. + +.. code-block:: python + + recipe.update_record(record) + print(record.properties['reduction_potential']['test_acn_vertical']) # Value of the property + + + +.. note:: + Creating a new recipe is a work in progress. + Open an issue, and a developer will help figure out how to make a new recipe. + + +.. [1] We define a molecule as unique based on its chemical formula (including H's), connectivity, and stereochemistry. + Stereoisomers are different molecules; molecules that only differ by charge are the same. 
+ +.. [2] Geometries are the same if their atom positions are not displaced by more than 10\ :sup:`-3` Å, + when both have a center of mass at the origin. We do not attempt to determine if molecules have different rotations. diff --git a/docs/conf.py b/docs/conf.py index c5e67ed..7861eb4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,11 +9,22 @@ project = 'ExaMol' copyright = '2023, Logan Ward' author = 'Logan Ward' +html_title = 'ExaMol' # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = [] +extensions = [ + 'sphinx.ext.napoleon', + 'sphinx.ext.intersphinx', + 'sphinx.ext.viewcode' +] + +intersphinx_mapping = {'python': ('https://docs.python.org/3', None), + 'mongoengine': ('https://docs.mongoengine.org/', None), + 'parsl': ('https://parsl.readthedocs.io/en/stable/', None), + 'colmena': ('https://colmena.readthedocs.io/en/stable/', None)} +autodoc_mock_imports = ["tensorflow", "sklearn"] templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] diff --git a/docs/index.rst b/docs/index.rst index 69a85b6..b8cd780 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,8 +1,3 @@ -.. ExaMol documentation master file, created by - sphinx-quickstart on Wed Mar 8 16:51:45 2023. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - ExaMol ====== @@ -18,13 +13,18 @@ Once defined, execute by calling it from the command line: examol run examples/redoxmers/spec.py:spec -Begin with our `Quickstart `_ to learn how to create the specification file, +Begin with our `Quickstart `_ to learn how to create the specification file then continue with the following section about the components of an ExaMol application -if your science requires new capabilities. +before finding the components needed for your application in the API documentation. 
+ +Source Code: https://github.com/exalearn/ExaMol .. toctree:: - :maxdepth: 1 + :maxdepth: 2 :caption: Contents: + installation quickstart + components/index + api/examol diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..d9f40e6 --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,38 @@ +Installation +============ + +ExaMol can be installed as a normal Python package. +We recommend installing that package inside of a virtual environment, +and prefer to use Anaconda to build the environment because it can install +non-Python dependencies (e.g., MongoDB). + +The first step to installing ExaMol is to download it from GitHub. +For example, create an updatable copy of the repository by + +.. code-block:: shell + + git clone git@github.com:exalearn/ExaMol.git + +Recommended: Anaconda +--------------------- + +The ``envs`` folder of ExaMol contains environment files suitable for different computers. + +The best one to start with installs CPU versions of all software: + +.. code-block:: shell + + conda env create --file envs/environment-cpu.yml + +Installation with Pip +--------------------- + +Start by creating or activating the virtual environment in which you will run ExaMol then invoke pip + +.. code-block:: shell + + pip install -e . + +The default installation will install all *necessary* packages but will skip some required for +some components, such as ``tensorflow`` and ``nfp`` for the :class:`~examol.score.nfp.NFPScorer`. +You may need to install these packages as you test ExaMol on your system. diff --git a/docs/quickstart.rst b/docs/quickstart.rst index f32f756..f038cda 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -1,4 +1,169 @@ Quickstart ========== -TBD \ No newline at end of file +Let us consider the +`redoxmer design example `_ +as a way to learn how to use ExaMol. + +.. note:: + This example assumes you installed xTB and other optional dependencies. 
+ We recommend you install the `CPU version of ExaMol via Anaconda `_. + +Running ExaMol +-------------- + +Run ExaMol by calling a command-line executable which launches computations across many resources. + +The executable takes at least one argument: the path to a Python specification file and the name of the variable +within that file which is the specification object. + +.. code-block:: shell + + examol run examples/redoxmers/spec.py:spec + +ExaMol will start writing logging messages to the screen to tell you what it is doing, +which is first to execute the specification file and load in the file you want + +.. code-block:: + + 2023-06-08 12:57:10,063 - examol - INFO - Starting ExaMol v0.0.1 + 2023-06-08 12:57:11,916 - examol.cli - INFO - Loaded specification from spec.py, where it was named spec + +Once loaded, ExaMol will create the functions to be executed (e.g., "run quantum chemistry") +and start a Parsl workflow engine in a subprocess. +The program will then launch `a steering engine <#steering-strategy>`_ in a thread before beginning +a series of monitoring routines as other threads. + +ExaMol will continue writing logging message to screen from all of these threads and will exit +once the steering engine completes. + +Understanding Outputs +--------------------- + +All data from an ExaMol run is written to the output directory defined in the specification file. + +Common files for the workflow include: + +- ``run.log``: The logging messages +- ``*-results.json``: Metadata about each task completed by ExaMol (e.g., if successful, when started) in + a line-delimited JSON format. Records follow Colmena's :class:`~colmena.models.Result` schema. +- ``database.json``: Data about each molecule assessed by ExaMol where each line follows + the :class:`~examol.store.models.MoleculeRecord` format. +- ``report.md``: A report of the workflow performance thus far. + +The run directory will contain data from all previous runs. + +.. 
note:: The example specification deletes any previous runs, but this is just for demonstration purposes. + +Configuring ExaMol +------------------ + +An ExaMol application is divided into a *Thinker* which defines which tasks to run +and a *Doer* which executes them on HPC resources. +Your task is to define the Thinker and Doer by a Python specification object. + +The specification object, :class:`~examol.specify.ExaMolSpecification`, +describes what the thinker is seeking to optimize, +how it will select calculations, +what those computations are, +and which resources are available to it. +A simple example looks something like: + +.. code-block:: python + + recipe = RedoxEnergy(charge=1, config_name='xtb') # What we're trying to optimize + spec = ExaMolSpecification( + database='training-data.json', + recipe=recipe, + search_space='search_space.smi', + selector=GreedySelector(to_select=8, maximize=True), + simulator=ASESimulator(scratch_dir='/tmp'), + scorer=RDKitScorer(), + models=[KNeighborsRegressor()], + num_to_run=8, + thinker=SingleObjectiveThinker, + thinker_options=dict(num_workers=2), + compute_config=config, + run_dir='run' + ) + +We'll go through each option briefly here, +and link out to pages that describe the full options available for each. + +Quantum Chemistry +~~~~~~~~~~~~~~~~~ + +The ``recipe`` and ``simulator`` options define which molecule property to compute +and an interface for ExaMol to compute it, respectively. + +Both recipes and simulators are designed to ensure all calculations in a set are performed with consistent settings. +ExaMol defines a set of pre-defined levels of accuracy, which are enumerated in +`the Simulate documentation `_. + +Recipes are based on the :class:`~examol.store.recipes.PropertyRecipe` class, +and implement methods to compute a certain property and determine which computations are needed. 
+Your specification will contain the details of what you wish to compute (e.g., which solvent for a solvation energy) +and the level of accuracy to compute it (e.g., which XC functional). +See the list of recipes and learn how to make your own `in the component documentation `_. + +The simulator is based on the :class:`~examol.simulate.base.BaseSimulator` class and +defines an interface to the computational chemistry code used to assess molecular energies. +Your specification will contain information on how to run each supported code on a specific supercomputer, +such as the path to its executable and how many nodes to use for each task. +See how to create one in the `Simulate documentation `_. + +Starting Data +~~~~~~~~~~~~~ + +The starting data for a project is a line-delimited JSON file describing what molecular properties are already known. +Each line of the file is a different molecule, with data following the :class:`~examol.store.models.MoleculeRecord` format. + +We recommend creating the initial database by running a seed set of molecules with purpose-built scripts. + +.. note:: I'll upload some example scripts soon and describe them here. + +Machine Learning +~~~~~~~~~~~~~~~~ + +ExaMol uses machine learning (ML) to estimate the output of computations. +The specification requires you to define an interface to run machine learning models (``scorer``) and +then a set of models (``models``) to be trained using that interface. + +The Scorer, like the `Simulator used in quantum chemistry <#quantum-chemistry>`_, defines an interface +for the ML computations and should be configured with information about how to run the model on your resources. +ExaMol provides interfaces for `a few common libraries `_ used in ML for molecular properties. + +The ``models`` define specific architectures used by the scorer. 
+Each model will be trained using a different subset of the training data, +and the predictions of all models will be combined to produce predictions with uncertainties for each molecule. + +Search Algorithm +~~~~~~~~~~~~~~~~ + +The design process is defined by the space of molecules (``search_space``), +how to search through them (``selector``), +and how many quantum chemistry computations will be run (``num_to_run``). + +The ``search_space`` option requires the path to a list of SMILES strings as a single file. + +The selector defines an adaptive experimental design algorithm -- an algorithm which uses the predictions +from machine learning models to identify the best computations. +ExaMol includes `several selection routines `_. + +Steering Strategy +~~~~~~~~~~~~~~~~~ + +The ``thinker`` provides the core capability behind ExaMol scaling to large supercomputers: +the ability to schedule many different tasks at once. +A Thinker strategy defines when to submit new tasks and what to do once they complete. +There is only one strategy available in ExaMol right now, :class:`~examol.steer.single.SingleObjectiveThinker`, +but more will become available as we build the library. + +Learn more in the `component documentation `_. + +Computational Resources +~~~~~~~~~~~~~~~~~~~~~~~ + +``compute_config`` requires a Parsl :class:`~parsl.config.Config` object describing the resources available to ExaMol. +Parsl's `quickstart describes the basics `_ of +how to describe the queueing system and compute nodes of your supercomputer. 
diff --git a/docs/requirements.txt b/docs/requirements.txt index 7bfd894..56ec255 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,3 @@ sphinx==7.* furo +nfp diff --git a/envs/environment-cpu.yml b/envs/environment-cpu.yml index 082a6ec..ec70de6 100644 --- a/envs/environment-cpu.yml +++ b/envs/environment-cpu.yml @@ -7,10 +7,13 @@ dependencies: - python==3.10.* - pandas==1.* - xtb-python==22.* + - scikit-learn>=1 - pymongo - jupyterlab - matplotlib - pytest - pip - pip: - - -e .. + - tensorflow-cpu + - nfp + - -e .. diff --git a/examol/score/nfp.py b/examol/score/nfp.py index 40aa2c9..7db3f23 100644 --- a/examol/score/nfp.py +++ b/examol/score/nfp.py @@ -28,7 +28,12 @@ def get_config(self): config['reduction_op'] = self.reduction_op return config - def call(self, inputs, mask=None): # pragma: no cover + def call(self, inputs, mask=None): + """ + Args: + inputs: Matrix to be reduced + mask: Identifies which rows to sum are placeholders + """ masked_tensor = tf.ragged.boolean_mask(inputs, mask) reduce_fn = getattr(tf.math, f'reduce_{self.reduction_op}') return reduce_fn(masked_tensor, axis=1) @@ -47,13 +52,14 @@ def make_simple_network( atomwise: bool = True, ) -> tf.keras.models.Model: """Construct a Keras model using the settings provided by a user + Args: atom_features: Number of features used per atom and bond message_steps: Number of message passing steps output_layers: Number of neurons in the readout layers reduce_op: Operation used to reduce from atom-level to molecule-level vectors atomwise: Whether to reduce atomwise contributions to form an output, - or reduce to a single vector per molecule before the output layers + or reduce to a single vector per molecule before the output layers Returns: A model instantiated with the user-defined options """ diff --git a/examol/simulate/ase/utils.py b/examol/simulate/ase/utils.py index b8b6384..101d3b4 100644 --- a/examol/simulate/ase/utils.py +++ b/examol/simulate/ase/utils.py @@ -12,7 +12,7 
@@ def make_ephemeral_calculator(calc: Calculator | dict) -> Iterator[Calculator]: """Make a calculator then tear it down after completion Args: - calc: Already-defined calculatori or a dict defining it. + calc: Already-defined calculator or a dict defining it. The dict must contain the key "name" to define the name of the code and could contain the keys "args" and "kwargs" to define the arguments and keyword arguments for creating diff --git a/examol/simulate/base.py b/examol/simulate/base.py index 57b4170..763218e 100644 --- a/examol/simulate/base.py +++ b/examol/simulate/base.py @@ -11,25 +11,22 @@ @dataclass() class SimResult: - """Stores the results from a calculation in a code-agnostic format - - Attributes: - config_name: Name of the configuration used to compute the energy - charge: Charge of the molecule - solvent: Solvent around the molecule, if any - xyz: XYZ-format structure, adjusted such that the center of mass is zero - energy: Energy of the molecule (units: eV) - forces: Forces acting on each atom (units: eV/Ang) - """ + """Stores the results from a calculation in a code-agnostic format""" # Information about the result config_name: str = field() + """Name of the configuration used to compute the energy""" charge: int = field() + """Charge of the molecule""" solvent: str | None = field() + """Solvent around the molecule, if any""" # Outputs xyz: str = field(repr=False) + """XYZ-format structure, adjusted such that the center of mass is at the origin""" energy: float | None = None + """Energy of the molecule (units: eV)""" forces: np.ndarray | None = None + """Forces acting on each atom (units: eV/Ang)""" def __post_init__(self): # Ensure the XYZ is centered about zero diff --git a/examol/specify.py b/examol/specify.py index 8d9de7f..eba278c 100644 --- a/examol/specify.py +++ b/examol/specify.py @@ -35,42 +35,41 @@ class ExaMolSpecification: - *Single Executor*: Specify a single executor and have ExaMol use that executor for all tasks - *Split 
Executor*: Specify two executors and label one "learning" and the other "simulation" to have the AI tasks be placed on one resource and simulation on the other. - - Attributes: - simulator: Tool used to perform quantum chemistry computations - recipe: Definition for how to compute the target property - selector: How to identify which computation to perform next - scorer: Defines algorithms used to retrain :attr:`models` - models: List of models that will be trained to predict :attr:`recipe` - database: Path to the initial dataset - search_space: Path to the molecules over which to search. Can either be a `.smi` file or a `.csv` where the first column - is the smiles string and the second is a form ready for inference with :attr:`scorer`. - thinker: Tool used to schedule computations - compute_config: Description of the available resources via Parsl. See :class:`~parsl.config.Config`. - reporters: List of classes which provide users with real-time information - num_to_run: Number of quantum chemistry computations to perform """ # Define the problem database: Path | str = ... + """Path to the initial dataset""" recipe: PropertyRecipe = ... + """Definition for how to compute the target property""" search_space: Path | str = ... + """Path to the molecules over which to search. Can either be a `.smi` file or a `.csv` where the first column + is the smiles string and the second is a form ready for inference with :attr:`scorer`.""" selector: Selector = ... + """How to identify which computation to perform next""" scorer: Scorer = ... + """Defines algorithms used to retrain and run :attr:`models`""" models: list[object] = ... + """List of machine learning models used to predict outcome of :attr:`recipe`""" simulator: BaseSimulator = ... + """Tool used to perform quantum chemistry computations""" num_to_run: int = ... + """Number of quantum chemistry computations to perform""" # Define how we create the thinker thinker: type[SingleObjectiveThinker] = ... 
+ """Policy used to schedule computations""" thinker_options: dict[str, object] = field(default_factory=dict) # Define how we communicate to the user reporters: list[BaseReporter] = field(default_factory=list) + """List of classes which provide users with real-time information""" # Define the computing resources compute_config: Config = ... + """Description of the available resources via Parsl. See :class:`~parsl.config.Config`.""" run_dir: Path | str = ... + """Path in which to write output files""" def assemble(self) -> tuple[BaseTaskServer, MoleculeThinker]: """Assemble the Colmena application""" diff --git a/examol/steer/single.py b/examol/steer/single.py index 6abc0b0..3cf4b97 100644 --- a/examol/steer/single.py +++ b/examol/steer/single.py @@ -18,7 +18,21 @@ class SingleObjectiveThinker(MoleculeThinker): - """A thinker which submits all computations needed to evaluate a molecule whenever it is selected""" + """A thinker which submits all computations needed to evaluate a molecule whenever it is selected + + Args: + queues: Queues used to communicate with the task server + run_dir: Directory in which to store logs, etc. + recipe: Recipe used to compute the target property + database: List of molecules which are already known + scorer: Tool used as part of model training + models: Models used to predict target property + selector: Tool used to pick which computations to run + num_to_run: Number of molecules to evaluate + search_space: Search space of molecules. 
Provided as an iterator over pairs of SMILES string and molecule in format ready for use with models + num_workers: Number of simulation tasks to run in parallel + inference_chunk_size: Number of molecules to run inference on per task + """ def __init__(self, queues: ColmenaQueues, @@ -32,21 +46,6 @@ def __init__(self, search_space: Iterable[tuple[str, object]], num_workers: int = 2, inference_chunk_size: int = 10000): - """ - - Args: - queues: Queues used to communicate with the task server - run_dir: Directory in which to store logs, etc. - recipe: Recipe used to compute the target property - database: List of molecules which are already known - scorer: Tool used as part of model training - models: Models used to predict target property - selector: Tool used to pick which computations to run - num_to_run: Number of molecules to evaluate - search_space: Search space of molecules. Provided as an iterator over pairs of SMILES string and molecule in format ready for use with models - num_workers: Number of simulation tasks to run in parallel - inference_chunk_size: Number of molecules to run inference on per task - """ super().__init__(queues, ResourceCounter(num_workers), run_dir, search_space, database, inference_chunk_size) # Store the selection equipment diff --git a/examol/store/models.py b/examol/store/models.py index 07fe6e2..a6801ae 100644 --- a/examol/store/models.py +++ b/examol/store/models.py @@ -17,17 +17,24 @@ class Identifiers(DynamicEmbeddedDocument): """IDs known for a molecule""" smiles = StringField(required=True) + """A SMILES string""" inchi = StringField(required=True) + """The InChI string""" pubchem_id = IntField() + """PubChem ID, if known""" class EnergyEvaluation(EmbeddedDocument): """Energy of a conformer under a certain condition""" - energy = FloatField(required=True, help_text='Energy of the conformer (eV)') - config_name = StringField(required=True, help_text='Configuration used to compute the energy') - charge = 
IntField(required=True, help_text='Charge used when computing the energy') - solvent = StringField(help_text='Solvent used, if any') + energy = FloatField(required=True) + """Energy of the conformer (eV)""" + config_name = StringField(required=True) + """Configuration used to compute the energy""" + charge = IntField(required=True) + """Charge used when computing the energy""" + solvent = StringField() + """Solvent used, if any""" def __eq__(self, other): if not isinstance(other, self.__class__): @@ -41,17 +48,24 @@ class Conformer(EmbeddedDocument): """Describes a single conformer of a molecule""" # Define the structure - xyz = StringField(required=True, help_text='XYZ-format description of the atomic coordinates') - xyz_hash = StringField(required=True, help_text='MD5 hash of xyz') + xyz = StringField(required=True) + """XYZ-format description of the atomic coordinates""" + xyz_hash = StringField(required=True) + """MD5 hash of the XYZ coordinates""" # Provenance of the structure - date_created = DateTimeField(required=True, help_text='Date this conformer was inserted') - source = StringField(required=True, choices=['relaxation', 'other'], help_text='Method used to generate this structure') - config_name = StringField(help_text='Configuration used to relax the structure') - charge = IntField(help_text='Charge used when relaxing the structure') + date_created = DateTimeField(required=True) + """Date this conformer was inserted""" + source = StringField(required=True, choices=['relaxation', 'other']) + """Method used to generate this structure (e.g., via relaxation)""" + config_name = StringField() + """Configuration used to relax the structure, if applicable""" + charge = IntField() + """Charge used when relaxing the structure""" # Energies of the structure - energies: list[EnergyEvaluation] = ListField(EmbeddedDocumentField(EnergyEvaluation), help_text='List of energies for this structure') + energies: list[EnergyEvaluation] =
ListField(EmbeddedDocumentField(EnergyEvaluation)) + """List of energies for this structure""" @property def atoms(self) -> ase.Atoms: @@ -136,16 +150,22 @@ class MoleculeRecord(Document): """Defines whatever we know about a molecule""" # Identifiers - key = StringField(min_length=27, max_length=27, required=True, primary_key=True, help_text='InChI key') - identifier: Identifiers = EmbeddedDocumentField(Identifiers, help_text='Collection of identifiers which define the molecule') - names = ListField(StringField(), help_text='Names this molecule is known by') - subsets = ListField(StringField(), help_text='List of subsets this molecule is part of') + key = StringField(min_length=27, max_length=27, required=True, primary_key=True) + """InChI key""" + identifier: Identifiers = EmbeddedDocumentField(Identifiers, help_text='') + """Collection of identifiers which define the molecule""" + names = ListField(StringField()) + """Names this molecule is known by""" + subsets = ListField(StringField()) + """List of subsets this molecule is part of""" # Data about the molecule conformers: list[Conformer] = ListField(EmbeddedDocumentField(Conformer)) + """All known conformers for this molecule""" # Characteristics - properties: dict[str, dict[str, float]] = DictField(help_text='Properties available for the molecule') + properties: dict[str, dict[str, float]] = DictField() + """Properties available for the molecule""" @classmethod def from_identifier(cls, smiles: str | None = None, inchi: str | None = None): diff --git a/examol/store/recipes.py b/examol/store/recipes.py index f3071ec..565b5a4 100644 --- a/examol/store/recipes.py +++ b/examol/store/recipes.py @@ -10,21 +10,18 @@ @dataclass class SimulationRequest: - """Request for a specific simulation type - - Attributes: - xyz: XYZ structure to use as the starting point - optimize: Whether to perform an optimization - config_name: Name of the computation - charge: Charge on the molecule - solvent: Name of solvent, if any - """ + 
"""Request for a specific simulation type""" xyz: str = field(repr=False) + """XYZ structure to use as the starting point""" optimize: bool = ... + """Whether to perform an optimization""" config_name: str = ... + """Name of the computation configuration""" charge: int = ... + """Charge on the molecule""" solvent: str | None = ... + """Name of solvent, if any""" class PropertyRecipe: diff --git a/pyproject.toml b/pyproject.toml index eebc60e..25d8e01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,12 +27,8 @@ dependencies = [ "rdkit>=2022.9", "networkx>=2.8", "parsl>=2022.12", - "scikit-learn==1.2.*", "more_itertools==9.*", - "foundry_ml", "mongoengine>=0.27", - "tensorflow>=2.7,<3", - "nfp", "tabulate>=0.9", "tqdm" ]