metatensor · abmazitov · Nov 25, 2024 · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024
diff --git a/docs/src/advanced-concepts/fine-tuning.rst b/docs/src/advanced-concepts/fine-tuning.rst
@@ -48,31 +48,17 @@ Prerequisites
 
 1. Train the Base Model. You can train the base model using the command:
 ``mtt train options.yaml``. Alternatively, you can use a pre-trained
-foundational model, if you have access to its state dict.
+foundational model, if you have access to its checkpoint. If you train the
+base model yourself, you will find the checkpoint file called
+``best_model.ckpt`` in the training directory.
 
-2. Define Paths in ``options.yaml``. Specify the paths to ``model_state_dict``,
-``all_species.npy``, and ``self_contributions.npy`` in the ``training``
-section of the ``options.yaml`` file:
-
-.. code-block:: yaml
-
-  training:
-    MODEL_TO_START_WITH: <path_to_model_state_dict>
-    ALL_SPECIES_PATH: <path_to_all_species.npy>
-    SELF_CONTRIBUTIONS_PATH: <path_to_self_contributions.npy>
-
-These parameters are relevant for the outputs of the PET model. If you are
-not familiar with their meaning, please refer to the :ref:`architecture-pet`
-model documentation.
-
-
-3. Set the LoRA parameters in the ``architecture.model``
+2. Set the LoRA parameters in the ``architecture.training``
 section of the ``options.yaml``:
 
 .. code-block:: yaml
 
   architecture:
-    model:
+    training:
       LORA_RANK: <desired_rank>
       LORA_ALPHA: <desired_alpha>
       USE_LORA_PEFT: True
@@ -82,7 +68,8 @@ These parameters control whether to use LoRA for pre-trained model fine-tuning
 (``LORA_RANK``), and the regularization factor for the low-rank matrices
 (``LORA_ALPHA``).
 
-4. Run ``mtt train options.yaml`` to fine-tune the model.
+3. Run ``mtt train options.yaml -c best_model.ckpt`` to fine-tune the model.
+The ``-c`` flag specifies the path to the pre-trained model checkpoint.
 
 Fine-Tuning Options
 ^^^^^^^^^^^^^^^^^^^

@@ -34,11 +34,11 @@ architecture:
     TARGET_INDEX_KEY: target_index
     RESIDUAL_FACTOR: 0.5
     USE_ZBL: False
+
+  training:
     USE_LORA_PEFT: False 
     LORA_RANK: 4
     LORA_ALPHA: 0.5
-
-  training:
     INITIAL_LR: 1e-4
     EPOCH_NUM_ATOMIC: 1000000000
     EPOCHS_WARMUP_ATOMIC: 100000000

@@ -13,16 +13,13 @@
     NeighborListOptions,
     System,
 )
-from pet.hypers import Hypers
 from pet.pet import PET as RawPET
-from pet.pet import SelfContributionsWrapper
 
 from metatrain.utils.data import DatasetInfo
 
 from ...utils.additive import ZBL
 from ...utils.dtype import dtype_to_str
-from .utils import systems_to_batch_dict, update_state_dict
-from .utils.fine_tuning import LoRAWrapper
+from .utils import load_raw_pet_model, systems_to_batch_dict
 
 
 logger = logging.getLogger(__name__)
@@ -48,6 +45,7 @@ def __init__(self, model_hypers: Dict, dataset_info: DatasetInfo) -> None:
         self.atomic_types: List[int] = dataset_info.atomic_types
         self.dataset_info = dataset_info
         self.pet = None
+        self.is_lora_applied = False
         self.checkpoint_path: Optional[str] = None
 
         # additive models: these are handled by the trainer at training
@@ -58,10 +56,30 @@ def __init__(self, model_hypers: Dict, dataset_info: DatasetInfo) -> None:
         self.additive_models = torch.nn.ModuleList(additive_models)
 
     def restart(self, dataset_info: DatasetInfo) -> "PET":
-        if dataset_info != self.dataset_info:
+        merged_info = self.dataset_info.union(dataset_info)
+        new_atomic_types = [
+            at for at in merged_info.atomic_types if at not in self.atomic_types
+        ]
+        new_targets = {
+            key: value
+            for key, value in merged_info.targets.items()
+            if key not in self.dataset_info.targets
+        }
+
+        if len(new_atomic_types) > 0:
+            raise ValueError(
+                f"New atomic types found in the dataset: {new_atomic_types}. "
+                "The PET model does not support adding new atomic types."
+            )
+
+        if len(new_targets) > 0:
             raise ValueError(
-                "PET cannot be restarted with different dataset information"
+                f"New targets found in the training options: {new_targets}. "
+                "The PET model does not support adding new training targets."
             )
+
+        self.dataset_info = merged_info
+        self.atomic_types = sorted(self.atomic_types)
         return self
 
     def set_trained_model(self, trained_model: RawPET) -> None:
@@ -140,29 +158,25 @@ def load_checkpoint(cls, path: Union[str, Path]) -> "PET":
 
         checkpoint = torch.load(path, weights_only=False, map_location="cpu")
         hypers = checkpoint["hypers"]
+        model_hypers = hypers["ARCHITECTURAL_HYPERS"]
         dataset_info = checkpoint["dataset_info"]
-        model = cls(
-            model_hypers=hypers["ARCHITECTURAL_HYPERS"], dataset_info=dataset_info
+        model = cls(model_hypers=model_hypers, dataset_info=dataset_info)
+        state_dict = checkpoint["model_state_dict"]
+        dtype = next(iter(state_dict.values())).dtype
+        lora_state_dict = checkpoint["lora_state_dict"]
+        if lora_state_dict is not None:
+            model.is_lora_applied = True
+        else:
+            lora_state_dict = {}
+        wrapper = load_raw_pet_model(
+            state_dict,
+            model.hypers,
+            model.atomic_types,
+            checkpoint["self_contributions"],
+            use_lora_peft=model.is_lora_applied,
+            **lora_state_dict,
         )
 
-        checkpoint = torch.load(path, weights_only=False)
-        state_dict = checkpoint["checkpoint"]["model_state_dict"]
-
-        ARCHITECTURAL_HYPERS = Hypers(model.hypers)
-        raw_pet = RawPET(ARCHITECTURAL_HYPERS, 0.0, len(model.atomic_types))
-        if ARCHITECTURAL_HYPERS.USE_LORA_PEFT:
-            lora_rank = ARCHITECTURAL_HYPERS.LORA_RANK
-            lora_alpha = ARCHITECTURAL_HYPERS.LORA_ALPHA
-            raw_pet = LoRAWrapper(raw_pet, lora_rank, lora_alpha)
-
-        new_state_dict = update_state_dict(state_dict)
-
-        dtype = next(iter(new_state_dict.values())).dtype
-        raw_pet.to(dtype).load_state_dict(new_state_dict)
-
-        self_contributions = checkpoint["self_contributions"]
-        wrapper = SelfContributionsWrapper(raw_pet, self_contributions)
-
         model.to(dtype).set_trained_model(wrapper)
 
         return model

@@ -126,7 +126,13 @@
         },
         "USE_ZBL": {
           "type": "boolean"
-        },
+        }
+      },
+      "additionalProperties": false
+    },
+    "training": {
+      "type": "object",
+      "properties": {
         "USE_LORA_PEFT": {
           "type": "boolean"
         },
@@ -135,13 +141,7 @@
         },
         "LORA_ALPHA": {
           "type": "number"
-        }
-      },
-      "additionalProperties": false
-    },
-    "training": {
-      "type": "object",
-      "properties": {
+        },
         "INITIAL_LR": {
           "type": "number"
         },