add timer impl #44

Merged
merged 16 commits on Dec 14, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
### Breaking changes

### Bug Fixes
- `exp_manager.max_time_per_run` is now respected: when the time limit is reached, the trainers save a checkpoint and run validation before exiting.

## [0.1.0] - 2023-12-04
### Added
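As a sketch of how the entry above is consumed (mirroring the training-script changes below): `max_time_per_run` is read from the experiment-manager config and handed to the new `Timer`. The `"00:03:45:00"` value is illustrative, and the `"DD:HH:MM:SS"` format is inferred from the `Timer` parsing added in `nemo_aligner/utils/distributed.py`.

```python
from omegaconf import OmegaConf

from nemo_aligner.utils.distributed import Timer

# Illustrative config: max_time_per_run is a "DD:HH:MM:SS" string
# (format inferred from Timer.__post_init__ in this PR).
cfg = OmegaConf.create({"exp_manager": {"max_time_per_run": "00:03:45:00"}})

# .get() returns None when the key is absent; Timer then falls back to an
# infinite duration, so runs without the setting are never stopped early.
timer = Timer(cfg.exp_manager.get("max_time_per_run"))
```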
5 changes: 4 additions & 1 deletion examples/nlp/gpt/train_gpt_dpo.py
@@ -14,14 +14,15 @@
from functools import partial

import torch.multiprocessing as mp
from omegaconf.omegaconf import OmegaConf, open_dict
from omegaconf.omegaconf import OmegaConf

from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
from nemo_aligner.algorithms.dpo import DPOTrainer, dpo_custom_collate
from nemo_aligner.data.nlp.builders import build_dataloader, build_train_valid_test_dpo_datasets
from nemo_aligner.models.nlp.gpt.megatron_gpt_dpo_model import MegatronGPTDPOModel
from nemo_aligner.utils.distributed import Timer
from nemo_aligner.utils.train_script_utils import (
CustomLoggerWrapper,
add_custom_checkpoint_callback,
@@ -131,6 +132,7 @@ def main(cfg) -> None:

logger.log_hyperparams(OmegaConf.to_container(cfg))

timer = Timer(cfg.exp_manager.get("max_time_per_run"))
dpo_trainer = DPOTrainer(
cfg=cfg.trainer.dpo,
model=ptl_model,
@@ -141,6 +143,7 @@
test_dataloader=None,
logger=logger,
ckpt_callback=ckpt_callback,
run_timer=timer,
)

if custom_trainer_state_dict is not None:
3 changes: 3 additions & 0 deletions examples/nlp/gpt/train_gpt_ppo_actor.py
@@ -29,6 +29,7 @@
)
from nemo_aligner.models.nlp.gpt.megatron_gpt_ppo_actor import MegatronGPTActorModel
from nemo_aligner.models.nlp.gpt.reward_critic_clients import RemoteGPTRMCriticClient
from nemo_aligner.utils.distributed import Timer
from nemo_aligner.utils.train_script_utils import (
CustomLoggerWrapper,
add_custom_checkpoint_callback,
@@ -153,6 +154,7 @@ def main(cfg) -> None:
logger.log_hyperparams(OmegaConf.to_container(cfg))

rm_critic = RemoteGPTRMCriticClient(cfg.remote_critic_rm)
timer = Timer(cfg.exp_manager.get("max_time_per_run"))

ppo_trainer = PPOTrainer(
cfg=cfg.trainer.ppo,
@@ -164,6 +166,7 @@
rm_critic=rm_critic,
logger=logger,
ckpt_callback=ckpt_callback,
run_timer=timer,
)

if custom_trainer_state_dict is not None:
3 changes: 3 additions & 0 deletions examples/nlp/gpt/train_gpt_sft.py
@@ -32,6 +32,7 @@
from nemo_aligner.algorithms.supervised import SupervisedTrainer
from nemo_aligner.data.nlp.builders import build_dataloader, build_sft_dataset
from nemo_aligner.models.nlp.gpt.gpt_sft_model import GPTSFTModel
from nemo_aligner.utils.distributed import Timer
from nemo_aligner.utils.train_script_utils import (
CustomLoggerWrapper,
add_custom_checkpoint_callback,
@@ -214,6 +215,7 @@ def main(cfg) -> None:
ckpt_callback = add_custom_checkpoint_callback(trainer, ptl_model)

logger.log_hyperparams(OmegaConf.to_container(cfg))
timer = Timer(cfg.exp_manager.get("max_time_per_run"))

sft_trainer = SupervisedTrainer(
cfg=cfg.trainer.sft,
@@ -225,6 +227,7 @@ def main(cfg) -> None:
test_dataloader=None,
logger=logger,
ckpt_callback=ckpt_callback,
run_timer=timer,
)

if custom_trainer_state_dict is not None:
5 changes: 4 additions & 1 deletion examples/nlp/gpt/train_reward_model.py
@@ -24,8 +24,8 @@
build_train_valid_test_regression_rm_datasets,
build_train_valid_test_rm_datasets,
)

from nemo_aligner.models.nlp.gpt.reward_model_classes import REWARD_MODEL_CLASS_DICT, RewardModelType
from nemo_aligner.utils.distributed import Timer
from nemo_aligner.utils.train_script_utils import (
CustomLoggerWrapper,
add_custom_checkpoint_callback,
@@ -131,6 +131,8 @@ def main(cfg) -> None:

logger.log_hyperparams(OmegaConf.to_container(cfg))

timer = Timer(cfg.exp_manager.get("max_time_per_run"))

rm_trainer = SupervisedTrainer(
cfg=cfg.trainer.rm,
model=ptl_model,
@@ -141,6 +143,7 @@
test_dataloader=None,
logger=logger,
ckpt_callback=ckpt_callback,
run_timer=timer,
)

if custom_trainer_state_dict is not None:
13 changes: 13 additions & 0 deletions nemo_aligner/algorithms/dpo.py
@@ -23,6 +23,7 @@
MegatronPretrainingRandomBatchSampler,
)
from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids
from nemo.utils import logging
from nemo_aligner.utils.distributed import SyncTimer
from nemo_aligner.utils.train_utils import clip_gradients
from nemo_aligner.utils.trainer_utils import check_progress, compute_limit_batches
@@ -79,6 +80,7 @@ def __init__(
test_dataloader,
logger,
ckpt_callback,
run_timer,
):
self.model = model
self.train_dataloader = train_dataloader
@@ -89,6 +91,9 @@
self.optimizer = optimizer
self.scheduler = scheduler

# this timer checks if we should stop training
self.run_timer = run_timer

self.step = 0
self.epoch = 0
self.consumed_samples = 0
@@ -188,6 +193,8 @@ def fit(self):
# epoch done
return

self.run_timer.start_time()

for _ in epoch_iter:
loop_iter = range(self.step, self.max_steps)

@@ -223,12 +230,14 @@

self.step += 1

run_time_exceeded = self.run_timer.is_finished()
run_val, save_model, is_train_end = check_progress(
self.step,
self.max_steps,
self.cfg.val_check_interval,
self.cfg.save_interval,
self.limit_val_batches,
run_time_exceeded=run_time_exceeded,
)

if run_val:
Expand All @@ -246,6 +255,10 @@ def fit(self):
metrics = {k: torch.as_tensor(v) for k, v in metrics.items()}
self.save(metrics, is_train_end=is_train_end)

if run_time_exceeded:
logging.info(f"Time limit given by run_timer={self.run_timer} reached. Stopping run")
return

metrics.clear()

self.epoch += 1
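For reviewers, a sketch of the contract `check_progress` is assumed to satisfy here, inferred only from the call sites in this diff; the real helper lives in `nemo_aligner/utils/trainer_utils.py` and may differ in detail.

```python
def check_progress(step, max_steps, val_check_interval, save_interval,
                   limit_val_batches, run_time_exceeded=False):
    """Inferred behavior: returns (run_val, save_model, is_train_end).

    Per the CHANGELOG entry, exceeding the time limit forces both a
    validation pass and a checkpoint save before the trainer exits.
    """
    is_train_end = step == max_steps
    run_val = (step % val_check_interval == 0 or is_train_end or run_time_exceeded) and limit_val_batches > 0
    save_model = step % save_interval == 0 or is_train_end or run_time_exceeded
    return run_val, save_model, is_train_end
```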
31 changes: 23 additions & 8 deletions nemo_aligner/algorithms/ppo.py
@@ -22,6 +22,7 @@
from tqdm import tqdm

from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split
from nemo.utils import logging
from nemo_aligner.utils.distributed import (
SyncTimer,
masked_global_mean_var,
@@ -36,6 +37,7 @@
)
from nemo_aligner.utils.server_utils import FutureResult
from nemo_aligner.utils.train_utils import clip_gradients
from nemo_aligner.utils.trainer_utils import check_progress
from nemo_aligner.utils.utils import clear_memory, cpu_dict, masked_mean


@@ -61,6 +63,7 @@ def __init__(
rm_critic,
logger,
ckpt_callback,
run_timer,
):
self.cfg = cfg
self.model = model
@@ -72,6 +75,9 @@
self.logger = logger
self.ckpt_callback = ckpt_callback

# this timer checks if we should stop training
self.run_timer = run_timer

self.consumed_samples = 0
self.epoch = 0
# the step here is PPO step
@@ -95,10 +101,6 @@ def __init__(
reduction="mean", sync_cuda=True, buffer_size=1, reduce_op=torch.distributed.ReduceOp.MAX
)

assert (
self.cfg.save_interval % self.cfg.val_check_interval == 0
), f"{self.cfg.save_interval=} must be divisible by {self.cfg.val_check_interval=}"

def generate_ppo_data(self, rollout_batches):
"""generate ppo specific data for training
"""
@@ -372,6 +374,7 @@ def fit(self):

num_to_load_on_each_dp = divide(self.cfg.model_gbs, dp_size)

self.run_timer.start_time()
for _ in global_pbar:
step_metrics = {}
timing_metrics = {}
@@ -412,8 +415,16 @@

self.step += 1

is_train_end = self.step == self.max_steps
run_val = (self.step % self.cfg.val_check_interval == 0) or is_train_end
run_time_exceeded = self.run_timer.is_finished()
run_val, save_model, is_train_end = check_progress(
self.step,
self.max_steps,
self.cfg.val_check_interval,
self.cfg.save_interval,
1.0, # TODO:(geshen): allow for limit val batches
run_time_exceeded=run_time_exceeded,
)

if run_val:
self.timer.start("validation_time")
val_metrics = self.run_validation()
@@ -439,10 +450,14 @@
step_metrics.update({f"train_{k}": v for k, v in metrics.items()})
global_pbar.set_postfix(step_metrics)

step_metrics = {k: torch.as_tensor(v) for k, v in step_metrics.items()}
if run_val and (self.step % self.cfg.save_interval == 0 or is_train_end):
if save_model:
step_metrics = {k: torch.as_tensor(v) for k, v in step_metrics.items()}
self.save(step_metrics, is_train_end=is_train_end)

if run_time_exceeded:
logging.info(f"Time limit given by run_timer={self.run_timer} reached. Stopping run")
return

self.epoch += 1

self.logger.finalize()
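Reviewer note: the removed assertion (`save_interval % val_check_interval == 0`) appears to have been needed only because saving was previously gated on validation steps (`if run_val and ...`); with `check_progress` deciding `run_val` and `save_model` independently, the two intervals no longer have to divide evenly.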
14 changes: 14 additions & 0 deletions nemo_aligner/algorithms/supervised.py
@@ -18,6 +18,8 @@
import torch
from omegaconf.dictconfig import DictConfig
from tqdm import tqdm
from nemo.utils import logging


from nemo_aligner.utils.distributed import SyncTimer
from nemo_aligner.utils.train_utils import clip_gradients
@@ -40,6 +42,7 @@
test_dataloader,
logger,
ckpt_callback,
run_timer,
):
self.model = model
self.train_dataloader = train_dataloader
@@ -50,6 +53,9 @@
self.optimizer = optimizer
self.scheduler = scheduler

# this timer checks if we should stop training
self.run_timer = run_timer

self.step = 0
self.epoch = 0
self.consumed_samples = 0
@@ -138,6 +144,8 @@ def fit(self):
# epoch done
return

self.run_timer.start_time()

for _ in epoch_iter:
loop_iter = range(self.step, self.max_steps)

@@ -166,12 +174,14 @@

self.step += 1

run_time_exceeded = self.run_timer.is_finished()
run_val, save_model, is_train_end = check_progress(
self.step,
self.max_steps,
self.cfg.val_check_interval,
self.cfg.save_interval,
self.limit_val_batches,
run_time_exceeded=run_time_exceeded,
)

if run_val:
Expand All @@ -189,6 +199,10 @@ def fit(self):
metrics = {k: torch.as_tensor(v) for k, v in metrics.items()}
self.save(metrics, is_train_end=is_train_end)

if run_time_exceeded:
logging.info(f"Time limit given by run_timer={self.run_timer} reached. Stopping run")
return

metrics.clear()

self.epoch += 1
38 changes: 38 additions & 0 deletions nemo_aligner/utils/distributed.py
@@ -14,8 +14,12 @@

"""distributed utils for communicating between different ranks"""

import time
import warnings
from collections import defaultdict
from dataclasses import dataclass
from datetime import timedelta
from typing import Dict, Optional, Union

import torch
from megatron.core import parallel_state, tensor_parallel
@@ -320,3 +324,37 @@ def sync_and_consume_over_stored_time(self, name=""):
yield from output_list

del self.stored_results[name]


@dataclass
class Timer:
"""Timer to tell us when the time limit is reached
"""

duration: Optional[str]

def __post_init__(self):
self._duration = float("inf")

if self.duration is not None:
days, hours, mins, seconds = map(int, self.duration.strip().split(":"))
self._duration = timedelta(days=days, hours=hours, minutes=mins, seconds=seconds).total_seconds()

def start_time(self):
self._start_time = time.monotonic()

def get_time_elapsed(self):
return time.monotonic() - self._start_time

def get_time_remaining(self):
return self._duration - self.get_time_elapsed()

def is_finished(self):
time_left = self.get_time_remaining()

is_finished = time_left <= 0
is_finished_tensor = torch.tensor([is_finished], dtype=torch.bool, device="cuda")

# only respect rank 0 timing
torch.distributed.broadcast(is_finished_tensor, 0)
return is_finished_tensor.item()
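A minimal end-to-end sketch of the `Timer` above, assuming a single GPU process. `is_finished()` broadcasts a CUDA tensor from rank 0, so a process group must already be initialized, just as in the trainers; the five-second budget and the env settings are illustrative.

```python
import os
import time

import torch.distributed as dist

from nemo_aligner.utils.distributed import Timer

# One-process group so is_finished()'s broadcast has somewhere to go; NCCL
# (and hence a GPU) is required because the flag tensor is allocated on CUDA.
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("nccl", rank=0, world_size=1)

timer = Timer("00:00:00:05")  # "DD:HH:MM:SS" -> a five-second budget
timer.start_time()

step = 0
while not timer.is_finished():  # rank 0's clock decides for every rank
    time.sleep(1)  # stand-in for one training step
    step += 1

print(f"stopped after {step} steps, {timer.get_time_elapsed():.1f}s elapsed")
dist.destroy_process_group()
```

Because only rank 0's elapsed time is broadcast, all ranks agree on when to stop even if their clocks drift.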