From 5a03d2696d50b0dba0ebe651347a9657e17c807d Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Thu, 14 Nov 2024 18:10:37 +0800 Subject: [PATCH] [cli] support run as module option (#6135) --- colossalai/cli/launcher/__init__.py | 27 ++++++++++++++++++++++----- colossalai/cli/launcher/run.py | 9 ++++++++- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/colossalai/cli/launcher/__init__.py b/colossalai/cli/launcher/__init__.py index 0f9ead6495db..99d87948cb5f 100644 --- a/colossalai/cli/launcher/__init__.py +++ b/colossalai/cli/launcher/__init__.py @@ -64,7 +64,8 @@ "This will be converted to --arg1=1 --arg2=2 during execution", ) @click.option("--ssh-port", type=int, default=None, help="(optional) the port used for ssh connection") -@click.argument("user_script", type=str) +@click.option("-m", type=str, default=None, help="run library module as a script (terminates option list)") +@click.argument("user_script", type=str, required=False, default=None) @click.argument("user_args", nargs=-1) def run( host: str, @@ -77,8 +78,9 @@ def run( master_port: int, extra_launch_args: str, ssh_port: int, + m: str, user_script: str, - user_args: str, + user_args: tuple, ) -> None: """ To launch multiple processes on a single node or multiple nodes via command line. @@ -102,9 +104,24 @@ def run( # run with hostfile excluding the hosts selected colossalai run --hostfile --master_addr host1 --exclude host2 --nprocs_per_node 4 train.py """ - if not user_script.endswith(".py"): - click.echo(f"Error: invalid Python file {user_script}. Did you use a wrong option? Try colossalai run --help") - exit() + if m is not None: + if m.endswith(".py"): + click.echo(f"Error: invalid Python module {m}. Did you use a wrong option? Try colossalai run --help") + exit() + if user_script is not None: + user_args = (user_script,) + user_args + user_script = m + m = True + else: + if user_script is None: + click.echo("Error: missing script argument. Did you use a wrong option? Try colossalai run --help") + exit() + if not user_script.endswith(".py"): + click.echo( + f"Error: invalid Python file {user_script}. Did you use a wrong option? Try colossalai run --help" + ) + exit() + m = False args_dict = locals() args = Config(args_dict) diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py index 88f70f02ec27..45b1056fdd5e 100644 --- a/colossalai/cli/launcher/run.py +++ b/colossalai/cli/launcher/run.py @@ -113,6 +113,7 @@ def get_launch_command( user_args: List[str], node_rank: int, num_nodes: int, + run_as_module: bool, extra_launch_args: str = None, ) -> str: """ @@ -155,6 +156,8 @@ def _arg_dict_to_list(arg_dict): torch_version = version.parse(torch.__version__) assert torch_version.major >= 1 + if torch_version.major < 2 and run_as_module: + raise ValueError("Torch version < 2.0 does not support running as module") if torch_version.major == 1 and torch_version.minor < 9: # torch distributed launch cmd with torch < 1.9 @@ -198,7 +201,10 @@ def _arg_dict_to_list(arg_dict): ] cmd += _arg_dict_to_list(default_torchrun_rdzv_args) - cmd += _arg_dict_to_list(extra_launch_args) + [user_script] + user_args + cmd += _arg_dict_to_list(extra_launch_args) + if run_as_module: + cmd.append("-m") + cmd += [user_script] + user_args cmd = " ".join(cmd) return cmd @@ -294,6 +300,7 @@ def launch_multi_processes(args: Config) -> None: user_args=args.user_args, node_rank=node_id, num_nodes=len(active_device_pool), + run_as_module=args.m, extra_launch_args=args.extra_launch_args, ) runner.send(hostinfo=hostinfo, cmd=cmd)