Add batch size for evaluation (#417)

metatensor · Dec 17, 2024 · 054427e · 054427e
1 parent 71d7ef5
commit 054427e
Show file tree

Hide file tree

Showing 5 changed files with 94 additions and 5 deletions.
diff --git a/docs/src/getting-started/usage.rst b/docs/src/getting-started/usage.rst
@@ -52,7 +52,7 @@ the restart ``options_restart.yaml`` file. To start the training create an
 
 .. literalinclude:: ../../../examples/basic_usage/usage.sh
     :language: bash
-    :lines: 3-8
+    :lines: 3-10
 
 After the training has finished, the ``mtt train`` command generates the ``model.ckpt``
 (final checkpoint) and ``model.pt`` (exported model) files in the current directory, as
@@ -69,8 +69,8 @@ The sub-command to evaluate an already trained model is
     mtt eval
 
 Besides the trained ``model``, you will also have to provide a file containing the
-system and possible target values for evaluation. The system of this ``eval.yaml``
-is exactly the same as for a dataset in the ``options.yaml`` file.
+system and possible target values for evaluation. The system section of this
+``eval.yaml`` is exactly the same as for a dataset in the ``options.yaml`` file.
 
 .. literalinclude:: ../../static/qm9/eval.yaml
    :language: yaml
@@ -82,8 +82,10 @@ typing
 
 .. literalinclude:: ../../../examples/basic_usage/usage.sh
     :language: bash
-    :lines: 9-24
+    :lines: 12-27
 
+An important parameter of ``mtt eval`` is the ``-b`` (or ``--batch-size``) option,
+which allows you to specify the batch size for the evaluation.
 
 Molecular simulations
 #####################

diff --git a/src/metatrain/cli/eval.py b/src/metatrain/cli/eval.py
@@ -84,6 +84,15 @@ def _add_eval_model_parser(subparser: argparse._SubParsersAction) -> None:
         default="output.xyz",
         help="filename of the predictions (default: %(default)s)",
     )
+    parser.add_argument(
+        "-b",
+        "--batch-size",
+        dest="batch_size",
+        required=False,
+        type=int,
+        default=1,
+        help="batch size for evaluation (default: %(default)s)",
+    )
     parser.add_argument(
         "--check-consistency",
         dest="check_consistency",
@@ -162,6 +171,7 @@ def _eval_targets(
     dataset: Union[Dataset, torch.utils.data.Subset],
     options: Dict[str, TargetInfo],
     return_predictions: bool,
+    batch_size: int = 1,
     check_consistency: bool = False,
 ) -> Optional[Dict[str, TensorMap]]:
     """Evaluates an exported model on a dataset and prints the RMSEs for each target.
@@ -192,10 +202,21 @@ def _eval_targets(
     logger.info(f"Running on device {device} with dtype {dtype}")
     model.to(dtype=dtype, device=device)
 
+    if len(dataset) % batch_size != 0:
+        # debug level: we don't want to clutter the output at the end of training
+        # gross issues will still show up in the standard deviation of the timings
+        logger.debug(
+            f"The dataset size ({len(dataset)}) is not a multiple of the batch size "
+            f"({batch_size}). {len(dataset) // batch_size} batches will be "
+            f"constructed with a batch size of {batch_size}, and the last batch will "
+            f"have a size of {len(dataset) % batch_size}. This might lead to "
+            "inaccurate average timings."
+        )
+
     # Create a dataloader
     dataloader = torch.utils.data.DataLoader(
         dataset,
-        batch_size=1,  # TODO: allow to set from outside!!
+        batch_size=batch_size,
         collate_fn=collate_fn,
         shuffle=False,
     )
@@ -302,6 +323,7 @@ def eval_model(
     model: Union[MetatensorAtomisticModel, torch.jit._script.RecursiveScriptModule],
     options: DictConfig,
     output: Union[Path, str] = "output.xyz",
+    batch_size: int = 1,
     check_consistency: bool = False,
 ) -> None:
     """Evaluate an exported model on a given data set.
@@ -366,6 +388,7 @@ def eval_model(
                 dataset=eval_dataset,
                 options=eval_info_dict,
                 return_predictions=True,
+                batch_size=batch_size,
                 check_consistency=check_consistency,
             )
         except Exception as e:

diff --git a/src/metatrain/cli/train.py b/src/metatrain/cli/train.py
@@ -464,6 +464,16 @@ def train_model(
     )
     mts_atomistic_model = mts_atomistic_model.to(final_device)
 
+    batch_size = _get_batch_size_from_hypers(hypers)
+    if batch_size is None:
+        logger.warning(
+            "Could not find batch size in hypers dictionary. "
+            "Using default value of 1 for final evaluation."
+        )
+        batch_size = 1
+    else:
+        logger.info(f"Running final evaluation with batch size {batch_size}")
+
     for i, train_dataset in enumerate(train_datasets):
         if len(train_datasets) == 1:
             extra_log_message = ""
@@ -476,6 +486,7 @@ def train_model(
             train_dataset,
             dataset_info.targets,
             return_predictions=False,
+            batch_size=batch_size,
         )
 
     for i, val_dataset in enumerate(val_datasets):
@@ -490,6 +501,7 @@ def train_model(
             val_dataset,
             dataset_info.targets,
             return_predictions=False,
+            batch_size=batch_size,
         )
 
     for i, test_dataset in enumerate(test_datasets):
@@ -504,4 +516,23 @@ def train_model(
             test_dataset,
             dataset_info.targets,
             return_predictions=False,
+            batch_size=batch_size,
         )
+
+
+def _get_batch_size_from_hypers(hypers: Union[Dict, DictConfig]) -> Optional[int]:
+    """Find the batch size in a (possibly nested) hyperparameter mapping.
+
+    The search is depth-first in key order: a nested ``dict``/``DictConfig`` value
+    is explored before the key holding it is itself tested. A key matches if it
+    equals ``"batchsize"`` after lower-casing and removing ``_``, ``-`` and spaces,
+    so ``batch_size``, ``batch-size``, ``Batch Size`` and ``BATCHSIZE`` all match.
+
+    :param hypers: hyperparameters to search
+    :return: the value stored under the first matching key, or :py:obj:`None` if no
+        key matches
+    """
+    for key, value in hypers.items():
+        if isinstance(value, (dict, DictConfig)):
+            nested = _get_batch_size_from_hypers(value)
+            if nested is not None:
+                return nested
+        # Keys are not guaranteed to be strings (e.g. integer keys in a config);
+        # such keys can never spell "batch size", so skip them instead of failing
+        # on ``key.lower()``.
+        if not isinstance(key, str):
+            continue
+        normalized = key.lower().replace("_", "").replace("-", "").replace(" ", "")
+        if normalized == "batchsize":
+            return value
+    return None
diff --git a/tests/cli/test_eval_model.py b/tests/cli/test_eval_model.py
@@ -78,6 +78,38 @@ def test_eval(monkeypatch, tmp_path, caplog, model_name, options):
     frames[0].info["energy"]
 
 
+@pytest.mark.parametrize("model_name", ["model-32-bit.pt", "model-64-bit.pt"])
+def test_eval_batch_size(monkeypatch, tmp_path, caplog, model_name, options):
+    """Test ``eval_model`` through the python API with ``batch_size=13``.
+
+    Besides the usual error and timing messages, the log is expected to contain
+    the debug note about inaccurate average timings.
+    """
+    monkeypatch.chdir(tmp_path)
+    # DEBUG is required to capture the "inaccurate average timings" message
+    caplog.set_level(logging.DEBUG)
+
+    structures_file = "qm9_reduced_100.xyz"
+    shutil.copy(RESOURCES_PATH / structures_file, structures_file)
+
+    eval_model(
+        model=torch.jit.load(RESOURCES_PATH / model_name),
+        options=options,
+        output="foo.xyz",
+        batch_size=13,
+        check_consistency=True,
+    )
+
+    # Check the logged metrics, timings and the batch size note
+    messages = "".join(record.message for record in caplog.records)
+    for expected in (
+        "energy RMSE (per atom)",
+        "energy MAE (per atom)",
+        "evaluation time",
+        "ms per atom",
+        "inaccurate average timings",
+    ):
+        assert expected in messages
+    # the dataset index is not expected in the log of this run
+    assert "dataset with index" not in messages
+
+    # Check that the predictions were written to the output file
+    frames = ase.io.read("foo.xyz", ":")
+    frames[0].info["energy"]
+
+
 def test_eval_export(monkeypatch, tmp_path, options):
     """Test evaluation of a trained model exported but not saved to disk."""
     monkeypatch.chdir(tmp_path)

diff --git a/tests/cli/test_train_model.py b/tests/cli/test_train_model.py
@@ -103,6 +103,7 @@ def test_train(capfd, monkeypatch, tmp_path, output):
     assert "train" in stdout_log
     assert "energy" in stdout_log
     assert "with index" not in stdout_log  # index only printed for more than 1 dataset
+    assert "Running final evaluation with batch size 2" in stdout_log
 
 
 @pytest.mark.parametrize(