From 8626fd55d4262d0ab5e36c0c3a21937551710d57 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Thu, 19 Dec 2024 17:28:20 -0800 Subject: [PATCH] Add exclusive flag Signed-off-by: Igor Gitman --- nemo_skills/pipeline/check_contamination.py | 2 ++ nemo_skills/pipeline/convert.py | 4 +++- nemo_skills/pipeline/eval.py | 2 ++ nemo_skills/pipeline/generate.py | 3 +++ nemo_skills/pipeline/start_server.py | 2 ++ nemo_skills/pipeline/train.py | 3 +++ 6 files changed, 15 insertions(+), 1 deletion(-) diff --git a/nemo_skills/pipeline/check_contamination.py b/nemo_skills/pipeline/check_contamination.py index 416472f81..88ea8583f 100644 --- a/nemo_skills/pipeline/check_contamination.py +++ b/nemo_skills/pipeline/check_contamination.py @@ -85,6 +85,7 @@ def check_contamination( help="Can specify a custom location for slurm logs. " "If not specified, will be inside `ssh_tunnel.job_dir` part of your cluster config.", ), + exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"), ): """Check contamination between train/test via an LLM call. @@ -149,6 +150,7 @@ def check_contamination( task_dependencies=prev_tasks, run_after=run_after, reuse_code_exp=reuse_code_exp, + slurm_kwargs={"exclusive": exclusive} if exclusive else None, ) prev_tasks = [new_task] run_exp(exp, cluster_config) diff --git a/nemo_skills/pipeline/convert.py b/nemo_skills/pipeline/convert.py index c90b4f43f..61098be8d 100755 --- a/nemo_skills/pipeline/convert.py +++ b/nemo_skills/pipeline/convert.py @@ -178,7 +178,8 @@ def convert( "Can provide an experiment name or an experiment object if running from code.", ), config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"), - log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "), + log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs."), + exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"), ): """Convert a checkpoint from one format to another. @@ -252,6 +253,7 @@ def convert( time_min=time_min, run_after=run_after, reuse_code_exp=reuse_code_exp, + slurm_kwargs={"exclusive": exclusive} if exclusive else None, ) run_exp(exp, cluster_config) diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index 9d2020974..c1864b7a5 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -125,6 +125,7 @@ def eval( help="Path to a custom dataset folder that will be searched in addition to the main one. " "Can also specify through NEMO_SKILLS_EXTRA_DATASETS.", ), + exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"), ): """Evaluate a model on specified benchmarks. @@ -230,6 +231,7 @@ def eval( reuse_code_exp=reuse_code_exp, extra_package_dirs=[extra_datasets] if extra_datasets else None, get_server_command=get_server_command, + slurm_kwargs={"exclusive": exclusive} if exclusive else None, ) run_exp(exp, cluster_config) diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index 4e5a2662a..597412000 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -198,6 +198,7 @@ def generate( ), config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"), log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs."), + exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"), ): """Generate LLM completions for a given input file. @@ -274,6 +275,7 @@ def generate( reuse_code_exp=reuse_code_exp, task_dependencies=prev_tasks, get_server_command=get_server_command, + slurm_kwargs={"exclusive": exclusive} if exclusive else None, ) prev_tasks = [new_task] else: @@ -316,6 +318,7 @@ def generate( reuse_code_exp=reuse_code_exp, task_dependencies=prev_tasks, get_server_command=get_server_command, + slurm_kwargs={"exclusive": exclusive} if exclusive else None, ) prev_tasks = [new_task] run_exp(exp, cluster_config) diff --git a/nemo_skills/pipeline/start_server.py b/nemo_skills/pipeline/start_server.py index dc17c892d..a6c09e617 100644 --- a/nemo_skills/pipeline/start_server.py +++ b/nemo_skills/pipeline/start_server.py @@ -52,6 +52,7 @@ def start_server( help="Can specify a custom location for slurm logs. " "If not specified, will be inside `ssh_tunnel.job_dir` part of your cluster config.", ), + exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"), ): """Self-host a model server.""" setup_logging(disable_hydra_logs=False) @@ -87,6 +88,7 @@ def start_server( time_min=time_min, server_config=server_config, with_sandbox=with_sandbox, + slurm_kwargs={"exclusive": exclusive} if exclusive else None, ) # we don't want to detach in this case even on slurm, so not using run_exp exp.run(detach=False, tail_logs=True) diff --git a/nemo_skills/pipeline/train.py b/nemo_skills/pipeline/train.py index 7c06d0a32..03b13f817 100755 --- a/nemo_skills/pipeline/train.py +++ b/nemo_skills/pipeline/train.py @@ -240,6 +240,7 @@ def train( ), config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"), log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "), + exclusive: bool = typer.Option(False, help="If True, will use --exclusive flag for slurm"), ): """Train (SFT or DPO) an LLM model. @@ -316,6 +317,7 @@ def train( run_after=run_after, reuse_code_exp=reuse_code_exp, task_dependencies=[prev_task] if prev_task is not None else None, + slurm_kwargs={"exclusive": exclusive} if exclusive else None, ) cmd = get_avg_checkpoints_cmd( @@ -340,6 +342,7 @@ def train( run_after=run_after, reuse_code_exp=reuse_code_exp, task_dependencies=[prev_task] if prev_task is not None else None, + slurm_kwargs={"exclusive": exclusive} if exclusive else None, ) # explicitly setting sequential to False since we set dependencies directly