metatensor · frostedoyster · Dec 9, 2024 · Dec 9, 2024 · Dec 13, 2024 · Dec 16, 2024
diff --git a/docs/src/getting-started/usage.rst b/docs/src/getting-started/usage.rst
@@ -52,7 +52,7 @@ the restart ``options_restart.yaml`` file. To start the training create an
 
 .. literalinclude:: ../../../examples/basic_usage/usage.sh
     :language: bash
-    :lines: 3-8
+    :lines: 3-10
 
 After the training has finished, the ``mtt train`` command generates the ``model.ckpt``
 (final checkpoint) and ``model.pt`` (exported model) files in the current directory, as
@@ -69,8 +69,8 @@ The sub-command to evaluate an already trained model is
     mtt eval
 
 Besides the trained ``model``, you will also have to provide a file containing the
-system and possible target values for evaluation. The system of this ``eval.yaml``
-is exactly the same as for a dataset in the ``options.yaml`` file.
+system and possible target values for evaluation. The system section of this
+``eval.yaml`` is exactly the same as for a dataset in the ``options.yaml`` file.
 
 .. literalinclude:: ../../static/qm9/eval.yaml
    :language: yaml
@@ -82,8 +82,10 @@ typing
 
 .. literalinclude:: ../../../examples/basic_usage/usage.sh
     :language: bash
-    :lines: 9-24
+    :lines: 12-27
 
+The ``-b`` (or ``--batch-size``) option of ``mtt eval`` sets the batch size used for
+the evaluation; if it is omitted, a batch size of 1 is used.
 
 Molecular simulations
 #####################

diff --git a/src/metatrain/cli/eval.py b/src/metatrain/cli/eval.py
@@ -84,6 +84,15 @@ def _add_eval_model_parser(subparser: argparse._SubParsersAction) -> None:
         default="output.xyz",
         help="filename of the predictions (default: %(default)s)",
     )
+    parser.add_argument(
+        "-b",
+        "--batch-size",
+        dest="batch_size",
+        required=False,
+        type=int,
+        default=1,
+        help="batch size for evaluation (default: %(default)s)",
+    )
     parser.add_argument(
         "--check-consistency",
         dest="check_consistency",
@@ -162,6 +171,7 @@ def _eval_targets(
     dataset: Union[Dataset, torch.utils.data.Subset],
     options: Dict[str, TargetInfo],
     return_predictions: bool,
+    batch_size: int = 1,
     check_consistency: bool = False,
 ) -> Optional[Dict[str, TensorMap]]:
     """Evaluates an exported model on a dataset and prints the RMSEs for each target.
@@ -192,10 +202,21 @@ def _eval_targets(
     logger.info(f"Running on device {device} with dtype {dtype}")
     model.to(dtype=dtype, device=device)
 
+    if len(dataset) % batch_size != 0:
+        # debug level: we don't want to clutter the output at the end of training
+        # gross issues will still show up in the standard deviation of the timings
+        logger.debug(
+            f"The dataset size ({len(dataset)}) is not a multiple of the batch size "
+            f"({batch_size}). {len(dataset) // batch_size} batches will be "
+            f"constructed with a batch size of {batch_size}, and the last batch will "
+            f"have a size of {len(dataset) % batch_size}. This might lead to "
+            "inaccurate average timings."
+        )
+
     # Create a dataloader
     dataloader = torch.utils.data.DataLoader(
         dataset,
-        batch_size=1,  # TODO: allow to set from outside!!
+        batch_size=batch_size,
         collate_fn=collate_fn,
         shuffle=False,
     )
@@ -302,6 +323,7 @@ def eval_model(
     model: Union[MetatensorAtomisticModel, torch.jit._script.RecursiveScriptModule],
     options: DictConfig,
     output: Union[Path, str] = "output.xyz",
+    batch_size: int = 1,
     check_consistency: bool = False,
 ) -> None:
     """Evaluate an exported model on a given data set.
@@ -366,6 +388,7 @@ def eval_model(
                 dataset=eval_dataset,
                 options=eval_info_dict,
                 return_predictions=True,
+                batch_size=batch_size,
                 check_consistency=check_consistency,
             )
         except Exception as e:

diff --git a/src/metatrain/cli/train.py b/src/metatrain/cli/train.py
@@ -464,6 +464,16 @@ def train_model(
     )
     mts_atomistic_model = mts_atomistic_model.to(final_device)
 
+    batch_size = _get_batch_size_from_hypers(hypers)
+    if batch_size is None:
+        logger.warning(
+            "Could not find batch size in hypers dictionary. "
+            "Using default value of 1 for final evaluation."
+        )
+        batch_size = 1
+    else:
+        logger.info(f"Running final evaluation with batch size {batch_size}")
+
     for i, train_dataset in enumerate(train_datasets):
         if len(train_datasets) == 1:
             extra_log_message = ""
@@ -476,6 +486,7 @@ def train_model(
             train_dataset,
             dataset_info.targets,
             return_predictions=False,
+            batch_size=batch_size,
         )
 
     for i, val_dataset in enumerate(val_datasets):
@@ -490,6 +501,7 @@ def train_model(
             val_dataset,
             dataset_info.targets,
             return_predictions=False,
+            batch_size=batch_size,
         )
 
     for i, test_dataset in enumerate(test_datasets):
@@ -504,4 +516,23 @@ def train_model(
             test_dataset,
             dataset_info.targets,
             return_predictions=False,
+            batch_size=batch_size,
         )
+
+
+def _get_batch_size_from_hypers(hypers: Union[Dict, DictConfig]) -> Optional[int]:
+    # Depth-first search of the (possibly nested) hypers for the first key that reads
+    # "batchsize" once lower-cased and stripped of "_", "-" and " " (e.g. "batch_size",
+    # "Batch-Size"). Returns the value stored under it, or None if no key matches.
+    for key in hypers.keys():
+        value = hypers[key]
+        if isinstance(value, (dict, DictConfig)):
+            batch_size = _get_batch_size_from_hypers(value)
+            if batch_size is not None:
+                return batch_size
+        # keys are not guaranteed to be strings (YAML allows e.g. integer keys), so
+        # convert before normalizing instead of failing with an AttributeError
+        normalized_key = str(key).lower().replace("_", "").replace("-", "")
+        if normalized_key.replace(" ", "") == "batchsize":
+            return value
+    return None
diff --git a/tests/cli/test_eval_model.py b/tests/cli/test_eval_model.py
@@ -78,6 +78,38 @@ def test_eval(monkeypatch, tmp_path, caplog, model_name, options):
     frames[0].info["energy"]
 
 
+@pytest.mark.parametrize("model_name", ["model-32-bit.pt", "model-64-bit.pt"])
+def test_eval_batch_size(monkeypatch, tmp_path, caplog, model_name, options):
+    """Test eval via python API with a batch size that does not divide the dataset."""
+    monkeypatch.chdir(tmp_path)
+    caplog.set_level(logging.DEBUG)  # the uneven-batch message is debug-level
+
+    shutil.copy(RESOURCES_PATH / "qm9_reduced_100.xyz", "qm9_reduced_100.xyz")
+
+    model = torch.jit.load(RESOURCES_PATH / model_name)
+
+    eval_model(
+        model=model,
+        options=options,
+        output="foo.xyz",
+        batch_size=13,
+        check_consistency=True,
+    )
+
+    # Check the logged metrics, the timings and the uneven-batch debug message
+    log = "".join([rec.message for rec in caplog.records])
+    assert "energy RMSE (per atom)" in log
+    assert "energy MAE (per atom)" in log
+    assert "dataset with index" not in log
+    assert "evaluation time" in log
+    assert "ms per atom" in log
+    assert "inaccurate average timings" in log
+
+    # Check that the predictions file was written and contains an energy entry
+    frames = ase.io.read("foo.xyz", ":")
+    frames[0].info["energy"]
+
+
 def test_eval_export(monkeypatch, tmp_path, options):
     """Test evaluation of a trained model exported but not saved to disk."""
     monkeypatch.chdir(tmp_path)

diff --git a/tests/cli/test_train_model.py b/tests/cli/test_train_model.py
@@ -101,6 +101,7 @@ def test_train(capfd, monkeypatch, tmp_path, output):
     assert "train" in stdout_log
     assert "energy" in stdout_log
     assert "with index" not in stdout_log  # index only printed for more than 1 dataset
+    assert "Running final evaluation with batch size 16" in stdout_log
 
 
 @pytest.mark.parametrize(
@@ -554,25 +555,6 @@ def test_architecture_error(options, monkeypatch, tmp_path):
         train_model(options)
 
 
-def test_train_issue_290(monkeypatch, tmp_path):
-    """Test the potential problem from issue #290."""
-    monkeypatch.chdir(tmp_path)
-    shutil.copy(DATASET_PATH_ETHANOL, "ethanol_reduced_100.xyz")
-
-    structures = ase.io.read("ethanol_reduced_100.xyz", ":")
-    more_structures = structures * 15 + [structures[0]]
-    ase.io.write("ethanol_1501.xyz", more_structures)
-
-    # run training with original options
-    options = OmegaConf.load(OPTIONS_PATH)
-    options["training_set"]["systems"]["read_from"] = "ethanol_1501.xyz"
-    options["training_set"]["targets"]["energy"]["key"] = "energy"
-    options["validation_set"] = 0.01
-    options["test_set"] = 0.85
-
-    train_model(options)
-
-
 def test_train_log_order(caplog, monkeypatch, tmp_path, options):
     """Tests that the log is always printed in the same order for forces
     and virials."""

diff --git a/tests/resources/options.yaml b/tests/resources/options.yaml
@@ -3,8 +3,12 @@ seed: 42
 architecture:
   name: experimental.soap_bpnn
   training:
-    batch_size: 2
+    batch_size: 16
     num_epochs: 1
+  model:
+    soap:
+      max_radial: 4
+      max_angular: 2
 
 training_set:
   systems:

diff --git a/tests/utils/test_llpr.py b/tests/utils/test_llpr.py
@@ -18,9 +18,6 @@
 from . import RESOURCES_PATH
 
 
-torch.manual_seed(42)
-
-
 def test_llpr(tmpdir):
 
     model = load_model(
@@ -102,7 +99,7 @@ def test_llpr(tmpdir):
             params.append(param.squeeze())
     weights = torch.cat(params)
 
-    n_ensemble_members = 10000
+    n_ensemble_members = 1000000  # converges slowly...
     llpr_model.calibrate(dataloader)
     llpr_model.generate_ensemble({"energy": weights}, n_ensemble_members)
     assert "energy_ensemble" in llpr_model.capabilities.outputs
@@ -143,7 +140,7 @@ def test_llpr(tmpdir):
     )
 
     torch.testing.assert_close(
-        analytical_uncertainty, ensemble_uncertainty, rtol=1e-2, atol=1e-2
+        analytical_uncertainty, ensemble_uncertainty, rtol=5e-3, atol=0.0
     )