From cb53dd1f63127dc15f8f87ddeb609f9b63eae2f7 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Fri, 27 Oct 2023 13:39:55 +0200 Subject: [PATCH 01/20] Updated torchrun instructions --- examples/README.md | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/examples/README.md b/examples/README.md index 06e06db0f8c..e0ebed846d4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -66,7 +66,7 @@ To run it in each of these various modes, use the following commands: ``` * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) ```bash - python -m torchrun --nproc_per_node 2 --use_env ./nlp_example.py + torchrun --nproc_per_node 2 ./nlp_example.py ``` - multi GPUs, multi node (several machines, using PyTorch distributed mode) * With Accelerate config and launcher, on each machine: @@ -76,16 +76,19 @@ To run it in each of these various modes, use the following commands: ``` * With PyTorch launcher only (`torch.distributed.launch` can be used in older versions of PyTorch) ```bash - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 0 \ - --master_addr master_node_ip_address \ - ./nlp_example.py # On the first server - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 1 \ - --master_addr master_node_ip_address \ - ./nlp_example.py # On the second server + torchrun --nproc_per_node 2 \ + --nnodes 2 + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ + ./nlp_example.py # On the first server + + torchrun --nproc_per_node 2 \ + --nnodes 2 + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ + ./nlp_example.py # On the second server ``` - (multi) TPUs * With Accelerate config and launcher @@ -154,7 +157,7 @@ To run it in each of these various modes, use the following commands: ``` * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) ```bash - python -m torchrun --nproc_per_node 2 --use_env ./cv_example.py --data_dir path_to_data + torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data ``` - multi GPUs, multi node (several machines, using PyTorch distributed mode) * With Accelerate config and launcher, on each machine: @@ -164,15 +167,18 @@ To run it in each of these various modes, use the following commands: ``` * With PyTorch launcher only (`torch.distributed.launch` can be used with older versions of PyTorch) ```bash - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 0 \ - --master_addr master_node_ip_address \ + torchrun --nproc_per_node 2 \ + --nnodes 2 + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ ./cv_example.py --data_dir path_to_data # On the first server - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 1 \ - --master_addr master_node_ip_address \ + + torchrun --nproc_per_node 2 \ + --nnodes 2 + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ ./cv_example.py --data_dir path_to_data # On the second server ``` - (multi) TPUs From 233ff9ebe477446f61a903f362e7c4cef70a112b Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:49:08 +0200 Subject: [PATCH 02/20] Update examples/README.md Co-authored-by: Benjamin Bossan --- 
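Note: the command removed in PATCH 01/20 was wrong on two counts: `torchrun` is a console script, so its module form is `python -m torch.distributed.run`, not `python -m torchrun`, and `--use_env` is a flag of the legacy `torch.distributed.launch` only — `torchrun` always exports `LOCAL_RANK` to each worker's environment. A side-by-side sketch of the legacy and current single-node invocations:

```bash
# Legacy launcher (deprecated): --use_env makes it pass the local rank through
# the LOCAL_RANK environment variable instead of a --local_rank argument
python -m torch.distributed.launch --nproc_per_node 2 --use_env ./nlp_example.py

# torchrun (equivalently: python -m torch.distributed.run): LOCAL_RANK is always
# set in each worker's environment, so no extra flag is needed
torchrun --nproc_per_node 2 ./nlp_example.py
```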
examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index e0ebed846d4..a54757ba267 100644 --- a/examples/README.md +++ b/examples/README.md @@ -77,7 +77,7 @@ To run it in each of these various modes, use the following commands: * With PyTorch launcher only (`torch.distributed.launch` can be used in older versions of PyTorch) ```bash torchrun --nproc_per_node 2 \ - --nnodes 2 + --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ From d83a084a951f4543fe90caa683ce13e2a41c3214 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:49:17 +0200 Subject: [PATCH 03/20] Update examples/README.md Co-authored-by: Benjamin Bossan --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index a54757ba267..e17db84f615 100644 --- a/examples/README.md +++ b/examples/README.md @@ -84,7 +84,7 @@ To run it in each of these various modes, use the following commands: ./nlp_example.py # On the first server torchrun --nproc_per_node 2 \ - --nnodes 2 + --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ From 824b3d8d9a58507a582cede3357c7beef20425c5 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:49:43 +0200 Subject: [PATCH 04/20] Update examples/README.md Co-authored-by: Benjamin Bossan --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index e17db84f615..8c266ca3d0c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -168,7 +168,7 @@ To run it in each of these various modes, use the following commands: * With PyTorch launcher only (`torch.distributed.launch` can be used with older versions of PyTorch) ```bash torchrun --nproc_per_node 2 \ - --nnodes 2 + --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ From 9604aa1f584403b54da131a9f7b200dfa5cd1db4 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:50:08 +0200 Subject: [PATCH 05/20] Update examples/README.md Co-authored-by: Benjamin Bossan --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 8c266ca3d0c..ebee4ef612e 100644 --- a/examples/README.md +++ b/examples/README.md @@ -175,7 +175,7 @@ To run it in each of these various modes, use the following commands: ./cv_example.py --data_dir path_to_data # On the first server torchrun --nproc_per_node 2 \ - --nnodes 2 + --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ From b9665bb197ef2f0cbb6a0ae876a56eb70b32088c Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:11:08 +0200 Subject: [PATCH 06/20] Update README.md for torchrun instructions --- examples/README.md | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/examples/README.md b/examples/README.md index ebee4ef612e..2391c4df46d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -64,7 +64,7 @@ To run it in each of these various 
modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./nlp_example.py # This will run the script on your server ``` - * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) + * With traditional PyTorch launcher ```bash torchrun --nproc_per_node 2 ./nlp_example.py ``` @@ -74,21 +74,14 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only (`torch.distributed.launch` can be used in older versions of PyTorch) + * With PyTorch launcher only. Run this commnad on each node: ```bash torchrun --nproc_per_node 2 \ --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ - ./nlp_example.py # On the first server - - torchrun --nproc_per_node 2 \ - --nnodes 2 \ - --rdzv_id 2299 \ # A unique job id - --rdzv_backend c10d \ - --rdzv_endpoint master_node_ip_address:29500 \ - ./nlp_example.py # On the second server + ./nlp_example.py ``` - (multi) TPUs * With Accelerate config and launcher @@ -155,7 +148,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on your server ``` - * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) + * With traditional PyTorch launcher ```bash torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data ``` @@ -165,21 +158,14 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on each server ``` - * With PyTorch launcher only (`torch.distributed.launch` can be used with older versions of PyTorch) + * With PyTorch launcher only. 
Run this commnad on each node: ```bash torchrun --nproc_per_node 2 \ --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ - ./cv_example.py --data_dir path_to_data # On the first server - - torchrun --nproc_per_node 2 \ - --nnodes 2 \ - --rdzv_id 2299 \ # A unique job id - --rdzv_backend c10d \ - --rdzv_endpoint master_node_ip_address:29500 \ - ./cv_example.py --data_dir path_to_data # On the second server + ./cv_example.py --data_dir path_to_data ``` - (multi) TPUs * With Accelerate config and launcher From 64954daf054fa755ffa4c302292cc8078142b184 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 31 Oct 2023 20:00:42 +0100 Subject: [PATCH 07/20] Added SLURM scripts and updated README --- examples/README.md | 21 ++++++++++----- examples/Slurm/submit-multiGPU.sh | 25 +++++++++++++++++ examples/Slurm/submit-multinode.sh | 43 ++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 examples/Slurm/submit-multiGPU.sh create mode 100644 examples/Slurm/submit-multinode.sh diff --git a/examples/README.md b/examples/README.md index 2391c4df46d..5cdd4182437 100644 --- a/examples/README.md +++ b/examples/README.md @@ -64,7 +64,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./nlp_example.py # This will run the script on your server ``` - * With traditional PyTorch launcher + * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`) ```bash torchrun --nproc_per_node 2 ./nlp_example.py ``` @@ -74,9 +74,10 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only. Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this commnad on each node: ```bash - torchrun --nproc_per_node 2 \ + torchrun \ # python -m torch.distributed.run + --nproc_per_node 2 \ --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ @@ -148,7 +149,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on your server ``` - * With traditional PyTorch launcher + * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`) ```bash torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data ``` @@ -158,9 +159,10 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on each server ``` - * With PyTorch launcher only. Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). 
Run this commnad on each node: ```bash - torchrun --nproc_per_node 2 \ + torchrun \ # python -m torch.distributed.run + --nproc_per_node 2 \ --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ @@ -198,6 +200,13 @@ with `pip install runhouse`, and you can refer to for hardware setup instructions, or this [Colab tutorial](https://colab.research.google.com/drive/1qVwYyLTCPYPSdz9ZX7BZl9Qm0A3j7RJe) for a more in-depth walkthrough. +## SLURM Scripts +In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) and [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. + +In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--nproc_per_node`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. + +In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--nproc_per_node`), the `--rdzv_id` which is a unique identifier, the [`--rdzv_backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--rdzv_endpoint` which will be the address and port of the master node. + ## Finer Examples While the first two scripts are extremely barebones when it comes to what you can do with accelerate, more advanced features are documented in two other locations. diff --git a/examples/Slurm/submit-multiGPU.sh b/examples/Slurm/submit-multiGPU.sh new file mode 100644 index 00000000000..1ee4afc1700 --- /dev/null +++ b/examples/Slurm/submit-multiGPU.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +#SBATCH --job-name=multigpu +#SBATCH -D . +#SBATCH --output=O-%x.%j +#SBATCH --error=E-%x.%j +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # number of MP tasks +#SBATCH --gres=gpu:4 # number of GPUs per node +#SBATCH --cpus-per-task=160 # number of cores per tasks +#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) + +###################### +### Set enviroment ### +###################### +source activateEnviroment.sh +###################### + +export SCRIPT=/accelerate/examples/complete_nlp_example.py +export SCRIPT_ARGS=" \ + --mixed_precision fp16 \ + --output_dir /accelerate/examples/output \ + " + +torchrun --nproc_per_node $SLURM_GPUS $SCRIPT $SCRIPT_ARGS \ No newline at end of file diff --git a/examples/Slurm/submit-multinode.sh b/examples/Slurm/submit-multinode.sh new file mode 100644 index 00000000000..951327f8827 --- /dev/null +++ b/examples/Slurm/submit-multinode.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +#SBATCH --job-name=multinode +#SBATCH -D . 
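+# -D . keeps the working directory at the submission directory; %x and %j in
+# the log file names below expand to the job name and the job id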
+#SBATCH --output=O-%x.%j +#SBATCH --error=E-%x.%j +#SBATCH --nodes=4 # number of nodes +#SBATCH --ntasks-per-node=1 # number of MP tasks +#SBATCH --gres=gpu:4 # number of GPUs per node +#SBATCH --cpus-per-task=160 # number of cores per tasks +#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) + +###################### +### Set enviroment ### +###################### +source activateEnviroment.sh +###################### + +###################### +#### Set network ##### +###################### +nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +nodes_array=($nodes) +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +###################### + +export LAUNCHER=" \ + torchrun \ + --nnodes $SLURM_NNODES \ + --nproc_per_node $SLURM_GPUS \ + --rdzv_id $RANDOM \ + --rdzv_backend c10d \ + --rdzv_endpoint $head_node_ip:29500 \ +" + +export SCRIPT=/accelerate/examples/complete_nlp_example.py +export SCRIPT_ARGS=" \ + --mixed_precision fp16 \ + --output_dir /accelerate/examples/output \ + " + +srun $LAUNCHER $SCRIPT $SCRIPT_ARGS \ No newline at end of file From 5a667b4b4d85e156bfbe49b1b9e1b439ca7e2295 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Thu, 2 Nov 2023 15:58:09 +0100 Subject: [PATCH 08/20] Update examples/Slurm/submit-multinode.sh Co-authored-by: Zach Mueller --- examples/Slurm/submit-multinode.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/Slurm/submit-multinode.sh b/examples/Slurm/submit-multinode.sh index 951327f8827..f7b3d433355 100644 --- a/examples/Slurm/submit-multinode.sh +++ b/examples/Slurm/submit-multinode.sh @@ -26,13 +26,11 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) ###################### export LAUNCHER=" \ - torchrun \ - --nnodes $SLURM_NNODES \ - --nproc_per_node $SLURM_GPUS \ - --rdzv_id $RANDOM \ - --rdzv_backend c10d \ - --rdzv_endpoint $head_node_ip:29500 \ -" + accelerate launch \ + --num_processes $((SLURM_NNODES * SLURM_GPUS)) \ + --num_machines $SLURM_NNODES \ + --rdzv_conf "id=$(RANDOM), backend=c10d", + --main_process_ip $head_node_ip \ export SCRIPT=/accelerate/examples/complete_nlp_example.py export SCRIPT_ARGS=" \ From c43be2571442c0a83020f07eb4c942968cc675b1 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Thu, 2 Nov 2023 15:58:18 +0100 Subject: [PATCH 09/20] Update examples/Slurm/submit-multiGPU.sh Co-authored-by: Zach Mueller --- examples/Slurm/submit-multiGPU.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Slurm/submit-multiGPU.sh b/examples/Slurm/submit-multiGPU.sh index 1ee4afc1700..f389072f0ce 100644 --- a/examples/Slurm/submit-multiGPU.sh +++ b/examples/Slurm/submit-multiGPU.sh @@ -22,4 +22,4 @@ export SCRIPT_ARGS=" \ --output_dir /accelerate/examples/output \ " -torchrun --nproc_per_node $SLURM_GPUS $SCRIPT $SCRIPT_ARGS \ No newline at end of file +accelerate launch --num_processes $SLURM_GPUS $SCRIPT $SCRIPT_ARGS \ No newline at end of file From bb959090211765ba92b51964ab21c925e927c6af Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Thu, 2 Nov 2023 15:58:39 +0100 Subject: [PATCH 10/20] Update examples/README.md Co-authored-by: Zach Mueller --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 
5cdd4182437..a369111a653 100644 --- a/examples/README.md +++ b/examples/README.md @@ -203,7 +203,7 @@ for hardware setup instructions, or this ## SLURM Scripts In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) and [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. -In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--nproc_per_node`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. +In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--nproc_per_node`), the `--rdzv_id` which is a unique identifier, the [`--rdzv_backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--rdzv_endpoint` which will be the address and port of the master node. From a5c26a83e8ee161ad1849b6c2d373bd37f316026 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Thu, 2 Nov 2023 16:04:54 +0100 Subject: [PATCH 11/20] Update examples/README.md Co-authored-by: Zach Mueller --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index a369111a653..fcb8dfd5266 100644 --- a/examples/README.md +++ b/examples/README.md @@ -205,7 +205,7 @@ In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) and [/Slurm/submit-mu In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. -In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--nproc_per_node`), the `--rdzv_id` which is a unique identifier, the [`--rdzv_backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--rdzv_endpoint` which will be the address and port of the master node. +In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--num_machines`), the `id` which is a unique identifier, the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `main_process_ip` which will be the address the master node. 
(Include `main_process_port` too if using another port other than 29500, the default) ## Finer Examples From b7becf9b4964ca2e5f7a0e88ee328070914b1bc9 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Thu, 2 Nov 2023 16:13:08 +0100 Subject: [PATCH 12/20] final details --- examples/README.md | 6 +++--- .../{Slurm/submit-multiGPU.sh => slurm/submit_multigpu.sh} | 0 .../submit-multinode.sh => slurm/submit_multinode.sh} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename examples/{Slurm/submit-multiGPU.sh => slurm/submit_multigpu.sh} (100%) rename examples/{Slurm/submit-multinode.sh => slurm/submit_multinode.sh} (100%) diff --git a/examples/README.md b/examples/README.md index fcb8dfd5266..cfb683fd88d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -201,11 +201,11 @@ for hardware setup instructions, or this [Colab tutorial](https://colab.research.google.com/drive/1qVwYyLTCPYPSdz9ZX7BZl9Qm0A3j7RJe) for a more in-depth walkthrough. ## SLURM Scripts -In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) and [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. +In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) and [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. -In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. +In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. -In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--num_machines`), the `id` which is a unique identifier, the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `main_process_ip` which will be the address the master node. (Include `main_process_port` too if using another port other than 29500, the default) +In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the `id` which is a unique identifier, the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--main_process_ip` which will be the address the master node. 
(Include `--main_process_port` too if using another port other than 29500, the default) ## Finer Examples diff --git a/examples/Slurm/submit-multiGPU.sh b/examples/slurm/submit_multigpu.sh similarity index 100% rename from examples/Slurm/submit-multiGPU.sh rename to examples/slurm/submit_multigpu.sh diff --git a/examples/Slurm/submit-multinode.sh b/examples/slurm/submit_multinode.sh similarity index 100% rename from examples/Slurm/submit-multinode.sh rename to examples/slurm/submit_multinode.sh From c3bef5cdd11a8a120602b5b7ce158f7400881d7f Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 01:26:03 +0100 Subject: [PATCH 13/20] modified argument parser --- examples/complete_cv_example.py | 2 +- examples/complete_nlp_example.py | 2 +- examples/cv_example.py | 2 +- examples/nlp_example.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/complete_cv_example.py b/examples/complete_cv_example.py index 0fcb6f747c2..c5c84f22127 100644 --- a/examples/complete_cv_example.py +++ b/examples/complete_cv_example.py @@ -310,7 +310,7 @@ def main(): default="logs", help="Location on where to store experiment tracking logs` and relevent project information", ) - args = parser.parse_args() + args, _ = parser.parse_known_args() config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224} training_function(config, args) diff --git a/examples/complete_nlp_example.py b/examples/complete_nlp_example.py index 999e9082d9f..9f981ad5130 100644 --- a/examples/complete_nlp_example.py +++ b/examples/complete_nlp_example.py @@ -302,7 +302,7 @@ def main(): default="logs", help="Location on where to store experiment tracking logs` and relevent project information", ) - args = parser.parse_args() + args, _ = parser.parse_known_args() config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16} training_function(config, args) diff --git a/examples/cv_example.py b/examples/cv_example.py index 6d8ca126471..e0948b4e742 100644 --- a/examples/cv_example.py +++ b/examples/cv_example.py @@ -202,7 +202,7 @@ def main(): help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", ) parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.") - args = parser.parse_args() + args, _ = parser.parse_known_args() config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224} training_function(config, args) diff --git a/examples/nlp_example.py b/examples/nlp_example.py index 636672f288a..60ba1aff224 100644 --- a/examples/nlp_example.py +++ b/examples/nlp_example.py @@ -200,7 +200,7 @@ def main(): "and an Nvidia Ampere GPU.", ) parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.") - args = parser.parse_args() + args, _ = parser.parse_known_args() config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16} training_function(config, args) From 39e5398455cbe48d79fdf810fa326fc57999e9a7 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 01:27:12 +0100 Subject: [PATCH 14/20] modified slurm multigpu script --- examples/slurm/submit_multigpu.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/slurm/submit_multigpu.sh b/examples/slurm/submit_multigpu.sh index f389072f0ce..709d7490064 100644 --- a/examples/slurm/submit_multigpu.sh +++ b/examples/slurm/submit_multigpu.sh @@ -14,12 +14,14 @@ ### Set enviroment ### ###################### source activateEnviroment.sh +export 
GPUS_PER_NODE=4 ###################### export SCRIPT=/accelerate/examples/complete_nlp_example.py export SCRIPT_ARGS=" \ --mixed_precision fp16 \ --output_dir /accelerate/examples/output \ + --with_tracking \ " -accelerate launch --num_processes $SLURM_GPUS $SCRIPT $SCRIPT_ARGS \ No newline at end of file +accelerate launch --num_processes $GPUS_PER_NODE $SCRIPT $SCRIPT_ARGS \ No newline at end of file From 5f07a9cdc2bb7e1bb20990411099c5f11378e504 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 11:46:26 +0100 Subject: [PATCH 15/20] modified multinode slurm script --- examples/slurm/submit_multinode.sh | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/examples/slurm/submit_multinode.sh b/examples/slurm/submit_multinode.sh index f7b3d433355..2ac61fcaf79 100644 --- a/examples/slurm/submit_multinode.sh +++ b/examples/slurm/submit_multinode.sh @@ -14,28 +14,27 @@ ### Set enviroment ### ###################### source activateEnviroment.sh +export GPUS_PER_NODE=4 ###################### ###################### #### Set network ##### ###################### -nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) -nodes_array=($nodes) -head_node=${nodes_array[0]} -head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) ###################### -export LAUNCHER=" \ - accelerate launch \ - --num_processes $((SLURM_NNODES * SLURM_GPUS)) \ - --num_machines $SLURM_NNODES \ - --rdzv_conf "id=$(RANDOM), backend=c10d", +export LAUNCHER="accelerate launch \ + --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \ + --num_machines $SLURM_NNODES \ + --rdzv_backend c10d \ --main_process_ip $head_node_ip \ - -export SCRIPT=/accelerate/examples/complete_nlp_example.py + --main_process_port 29500 \ + " +export SCRIPT="/accelerate/examples/complete_nlp_example.py" export SCRIPT_ARGS=" \ --mixed_precision fp16 \ --output_dir /accelerate/examples/output \ " -srun $LAUNCHER $SCRIPT $SCRIPT_ARGS \ No newline at end of file +export CMD="$LAUNCHER $PYTHON_FILE $ARGS" +srun $CMD \ No newline at end of file From 6525931792e83f8850f5d67ffcd759cdef246824 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 11:53:09 +0100 Subject: [PATCH 16/20] Added accelerate multine issue --- examples/slurm/submit_multinode.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/slurm/submit_multinode.sh b/examples/slurm/submit_multinode.sh index 2ac61fcaf79..dfd3fa023c5 100644 --- a/examples/slurm/submit_multinode.sh +++ b/examples/slurm/submit_multinode.sh @@ -35,6 +35,7 @@ export SCRIPT_ARGS=" \ --mixed_precision fp16 \ --output_dir /accelerate/examples/output \ " - -export CMD="$LAUNCHER $PYTHON_FILE $ARGS" + +# This step is necessary because accelerate launch does not handle multiline arguments properly +export CMD="$LAUNCHER $PYTHON_FILE $ARGS" srun $CMD \ No newline at end of file From 55f3031d67b554b558172f1f5c0adbd7eae20da6 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:55:03 +0100 Subject: [PATCH 17/20] Update examples/README.md Co-authored-by: Zach Mueller --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index cfb683fd88d..cb2a77f6904 100644 --- a/examples/README.md +++ b/examples/README.md @@ -74,7 +74,7 @@ To run it in each of these various modes, use the following 
commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash torchrun \ # python -m torch.distributed.run --nproc_per_node 2 \ From 72069d5be52f2f70faab324999848b0af55fed9d Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 11:57:40 +0100 Subject: [PATCH 18/20] fixed readme commnad --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index cfb683fd88d..a2a75dc0c59 100644 --- a/examples/README.md +++ b/examples/README.md @@ -74,7 +74,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash torchrun \ # python -m torch.distributed.run --nproc_per_node 2 \ @@ -159,7 +159,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on each server ``` - * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash torchrun \ # python -m torch.distributed.run --nproc_per_node 2 \ From 849139613a7c83aff719588616ad6275ef389b08 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 12:11:49 +0100 Subject: [PATCH 19/20] added --main_process_port specification to readme --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index a2a75dc0c59..a6e6617d088 100644 --- a/examples/README.md +++ b/examples/README.md @@ -205,7 +205,7 @@ In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) and [/slurm/submit_mu In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. -In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the `id` which is a unique identifier, the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--main_process_ip` which will be the address the master node. 
(Include `--main_process_port` too if using another port other than 29500, the default) +In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip` which will be the address the master node and the `--main_process_port`. ## Finer Examples From d1170b19fe4babccd441d4509def3033179da830 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 16:48:23 +0100 Subject: [PATCH 20/20] Revert "modified argument parser" This reverts commit c3bef5cdd11a8a120602b5b7ce158f7400881d7f. --- examples/complete_cv_example.py | 2 +- examples/complete_nlp_example.py | 2 +- examples/cv_example.py | 2 +- examples/nlp_example.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/complete_cv_example.py b/examples/complete_cv_example.py index c5c84f22127..0fcb6f747c2 100644 --- a/examples/complete_cv_example.py +++ b/examples/complete_cv_example.py @@ -310,7 +310,7 @@ def main(): default="logs", help="Location on where to store experiment tracking logs` and relevent project information", ) - args, _ = parser.parse_known_args() + args = parser.parse_args() config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224} training_function(config, args) diff --git a/examples/complete_nlp_example.py b/examples/complete_nlp_example.py index 9f981ad5130..999e9082d9f 100644 --- a/examples/complete_nlp_example.py +++ b/examples/complete_nlp_example.py @@ -302,7 +302,7 @@ def main(): default="logs", help="Location on where to store experiment tracking logs` and relevent project information", ) - args, _ = parser.parse_known_args() + args = parser.parse_args() config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16} training_function(config, args) diff --git a/examples/cv_example.py b/examples/cv_example.py index e0948b4e742..6d8ca126471 100644 --- a/examples/cv_example.py +++ b/examples/cv_example.py @@ -202,7 +202,7 @@ def main(): help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", ) parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.") - args, _ = parser.parse_known_args() + args = parser.parse_args() config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224} training_function(config, args) diff --git a/examples/nlp_example.py b/examples/nlp_example.py index 60ba1aff224..636672f288a 100644 --- a/examples/nlp_example.py +++ b/examples/nlp_example.py @@ -200,7 +200,7 @@ def main(): "and an Nvidia Ampere GPU.", ) parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.") - args, _ = parser.parse_known_args() + args = parser.parse_args() config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16} training_function(config, args)
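Series note: after PATCH 20/20, the launch line of examples/slurm/submit_multinode.sh still expands `$PYTHON_FILE` and `$ARGS`, two variables the script never defines — the variables exported earlier in the file are `SCRIPT` and `SCRIPT_ARGS` — so `srun` would start `accelerate launch` with no training script. A minimal sketch of the corrected tail of the script, reusing the `GPUS_PER_NODE` and `head_node_ip` values set earlier in the same file:

```bash
export LAUNCHER="accelerate launch \
    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
    --num_machines $SLURM_NNODES \
    --rdzv_backend c10d \
    --main_process_ip $head_node_ip \
    --main_process_port 29500 \
    "
export SCRIPT="/accelerate/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
    --mixed_precision fp16 \
    --output_dir /accelerate/examples/output \
    "

# Build a single command string (accelerate launch does not handle multiline
# arguments properly), expanding the variables that are actually defined above
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
srun $CMD
```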