From 8626fd55d4262d0ab5e36c0c3a21937551710d57 Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Thu, 19 Dec 2024 17:28:20 -0800
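Subject: [PATCH] Add exclusive flag for slurm to pipeline commands

Add an `exclusive` boolean option (default False) to the
check_contamination, convert, eval, generate, start_server and train
pipeline commands. When the option is set, each command forwards
slurm_kwargs={"exclusive": True} to the calls updated below; otherwise
it passes slurm_kwargs=None.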

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 nemo_skills/pipeline/check_contamination.py | 2 ++
 nemo_skills/pipeline/convert.py             | 4 +++-
 nemo_skills/pipeline/eval.py                | 2 ++
 nemo_skills/pipeline/generate.py            | 3 +++
 nemo_skills/pipeline/start_server.py        | 2 ++
 nemo_skills/pipeline/train.py               | 3 +++
 6 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/pipeline/check_contamination.py b/nemo_skills/pipeline/check_contamination.py
index 416472f81..88ea8583f 100644
--- a/nemo_skills/pipeline/check_contamination.py
+++ b/nemo_skills/pipeline/check_contamination.py
@@ -85,6 +85,7 @@ def check_contamination(
         help="Can specify a custom location for slurm logs. "
         "If not specified, will be inside `ssh_tunnel.job_dir` part of your cluster config.",
     ),
+    exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"),
 ):
     """Check contamination between train/test via an LLM call.
 
@@ -149,6 +150,7 @@ def check_contamination(
                 task_dependencies=prev_tasks,
                 run_after=run_after,
                 reuse_code_exp=reuse_code_exp,
+                slurm_kwargs={"exclusive": exclusive} if exclusive else None,
             )
             prev_tasks = [new_task]
         run_exp(exp, cluster_config)
diff --git a/nemo_skills/pipeline/convert.py b/nemo_skills/pipeline/convert.py
index c90b4f43f..61098be8d 100755
--- a/nemo_skills/pipeline/convert.py
+++ b/nemo_skills/pipeline/convert.py
@@ -178,7 +178,8 @@ def convert(
         "Can provide an experiment name or an experiment object if running from code.",
     ),
     config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
-    log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
+    log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs."),
+    exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"),
 ):
     """Convert a checkpoint from one format to another.
 
@@ -252,6 +253,7 @@ def convert(
             time_min=time_min,
             run_after=run_after,
             reuse_code_exp=reuse_code_exp,
+            slurm_kwargs={"exclusive": exclusive} if exclusive else None,
         )
         run_exp(exp, cluster_config)
 
diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py
index 9d2020974..c1864b7a5 100644
--- a/nemo_skills/pipeline/eval.py
+++ b/nemo_skills/pipeline/eval.py
@@ -125,6 +125,7 @@ def eval(
         help="Path to a custom dataset folder that will be searched in addition to the main one. "
         "Can also specify through NEMO_SKILLS_EXTRA_DATASETS.",
     ),
+    exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"),
 ):
     """Evaluate a model on specified benchmarks.
 
@@ -230,6 +231,7 @@ def eval(
                 reuse_code_exp=reuse_code_exp,
                 extra_package_dirs=[extra_datasets] if extra_datasets else None,
                 get_server_command=get_server_command,
+                slurm_kwargs={"exclusive": exclusive} if exclusive else None,
             )
         run_exp(exp, cluster_config)
 
diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
index 4e5a2662a..597412000 100644
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
@@ -198,6 +198,7 @@ def generate(
     ),
     config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
     log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs."),
+    exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"),
 ):
     """Generate LLM completions for a given input file.
 
@@ -274,6 +275,7 @@ def generate(
                         reuse_code_exp=reuse_code_exp,
                         task_dependencies=prev_tasks,
                         get_server_command=get_server_command,
+                        slurm_kwargs={"exclusive": exclusive} if exclusive else None,
                     )
                     prev_tasks = [new_task]
         else:
@@ -316,6 +318,7 @@ def generate(
                     reuse_code_exp=reuse_code_exp,
                     task_dependencies=prev_tasks,
                     get_server_command=get_server_command,
+                    slurm_kwargs={"exclusive": exclusive} if exclusive else None,
                 )
                 prev_tasks = [new_task]
         run_exp(exp, cluster_config)
diff --git a/nemo_skills/pipeline/start_server.py b/nemo_skills/pipeline/start_server.py
index dc17c892d..a6c09e617 100644
--- a/nemo_skills/pipeline/start_server.py
+++ b/nemo_skills/pipeline/start_server.py
@@ -52,6 +52,7 @@ def start_server(
         help="Can specify a custom location for slurm logs. "
         "If not specified, will be inside `ssh_tunnel.job_dir` part of your cluster config.",
     ),
+    exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"),
 ):
     """Self-host a model server."""
     setup_logging(disable_hydra_logs=False)
@@ -87,6 +88,7 @@ def start_server(
             time_min=time_min,
             server_config=server_config,
             with_sandbox=with_sandbox,
+            slurm_kwargs={"exclusive": exclusive} if exclusive else None,
         )
         # we don't want to detach in this case even on slurm, so not using run_exp
         exp.run(detach=False, tail_logs=True)
diff --git a/nemo_skills/pipeline/train.py b/nemo_skills/pipeline/train.py
index 7c06d0a32..03b13f817 100755
--- a/nemo_skills/pipeline/train.py
+++ b/nemo_skills/pipeline/train.py
@@ -240,6 +240,7 @@ def train(
     ),
     config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
     log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
+    exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"),
 ):
     """Train (SFT or DPO) an LLM model.
 
@@ -316,6 +317,7 @@ def train(
                 run_after=run_after,
                 reuse_code_exp=reuse_code_exp,
                 task_dependencies=[prev_task] if prev_task is not None else None,
+                slurm_kwargs={"exclusive": exclusive} if exclusive else None,
             )
 
         cmd = get_avg_checkpoints_cmd(
@@ -340,6 +342,7 @@ def train(
             run_after=run_after,
             reuse_code_exp=reuse_code_exp,
             task_dependencies=[prev_task] if prev_task is not None else None,
+            slurm_kwargs={"exclusive": exclusive} if exclusive else None,
         )
 
         # explicitly setting sequential to False since we set dependencies directly