Skip to content

Commit

Permalink
check port availability only in main deepspeed/torchrun launcher (#2078)
Browse files Browse the repository at this point in the history
* check port availability only in main deepspeed launcher

* check port availability only in main launcher for deepspeed/torchrun

* Update launch.py

add comments

---------

Co-authored-by: 聂靖入 <[email protected]>
  • Loading branch information
Jingru and 聂靖入 authored Nov 17, 2023
1 parent 99877f5 commit cf745c9
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions src/accelerate/utils/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,10 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
if main_process_port is None:
main_process_port = 29500

if is_port_in_use(main_process_port):
# Only the main launcher (a single-machine run, or machine rank 0) needs to check port availability.
# Non-main launchers skip the check so that multiple launchers can be started on the same machine (e.g. to split log files).
need_port_check = num_machines <= 1 or int(args.machine_rank) == 0
if need_port_check and is_port_in_use(main_process_port):
raise ConnectionError(
f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. "
"Please specify a different port (such as using the `----main_process_port` flag or specifying a different `main_process_port` in your config file)"
Expand Down Expand Up @@ -272,7 +275,10 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict
if main_process_port is None:
main_process_port = 29500

if is_port_in_use(main_process_port):
# Only the main launcher (a single-machine run, or machine rank 0) needs to check port availability.
# Non-main launchers skip the check so that multiple launchers can be started on the same machine (e.g. to split log files).
need_port_check = num_machines <= 1 or int(args.machine_rank) == 0
if need_port_check and is_port_in_use(main_process_port):
raise ConnectionError(
f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. "
"Please specify a different port (such as using the `----main_process_port` flag or specifying a different `main_process_port` in your config file)"
Expand Down

0 comments on commit cf745c9

Please sign in to comment.