From cb53dd1f63127dc15f8f87ddeb609f9b63eae2f7 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Fri, 27 Oct 2023 13:39:55 +0200 Subject: [PATCH 01/20] Updated torchrun instructions --- examples/README.md | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/examples/README.md b/examples/README.md index 06e06db0f8c..e0ebed846d4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -66,7 +66,7 @@ To run it in each of these various modes, use the following commands: ``` * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) ```bash - python -m torchrun --nproc_per_node 2 --use_env ./nlp_example.py + torchrun --nproc_per_node 2 ./nlp_example.py ``` - multi GPUs, multi node (several machines, using PyTorch distributed mode) * With Accelerate config and launcher, on each machine: @@ -76,16 +76,19 @@ To run it in each of these various modes, use the following commands: ``` * With PyTorch launcher only (`torch.distributed.launch` can be used in older versions of PyTorch) ```bash - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 0 \ - --master_addr master_node_ip_address \ - ./nlp_example.py # On the first server - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 1 \ - --master_addr master_node_ip_address \ - ./nlp_example.py # On the second server + torchrun --nproc_per_node 2 \ + --nnodes 2 + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ + ./nlp_example.py # On the first server + + torchrun --nproc_per_node 2 \ + --nnodes 2 + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ + ./nlp_example.py # On the second server ``` - (multi) TPUs * With Accelerate config and launcher @@ -154,7 +157,7 @@ To run it in each of these various modes, use the following commands: ``` * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) ```bash - python -m torchrun --nproc_per_node 2 --use_env ./cv_example.py --data_dir path_to_data + torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data ``` - multi GPUs, multi node (several machines, using PyTorch distributed mode) * With Accelerate config and launcher, on each machine: @@ -164,15 +167,18 @@ To run it in each of these various modes, use the following commands: ``` * With PyTorch launcher only (`torch.distributed.launch` can be used with older versions of PyTorch) ```bash - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 0 \ - --master_addr master_node_ip_address \ + torchrun --nproc_per_node 2 \ + --nnodes 2 + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ ./cv_example.py --data_dir path_to_data # On the first server - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 1 \ - --master_addr master_node_ip_address \ + + torchrun --nproc_per_node 2 \ + --nnodes 2 + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ ./cv_example.py --data_dir path_to_data # On the second server ``` - (multi) TPUs From 233ff9ebe477446f61a903f362e7c4cef70a112b Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:49:08 +0200 Subject: [PATCH 02/20] Update examples/README.md Co-authored-by: Benjamin Bossan --- 
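Note: the command removed in PATCH 01/20 was wrong on two counts: `torchrun` is a console script, so its module form is `python -m torch.distributed.run`, not `python -m torchrun`, and `--use_env` is a flag of the legacy `torch.distributed.launch` only — `torchrun` always exports `LOCAL_RANK` to each worker's environment. A side-by-side sketch of the legacy and current single-node invocations:

```bash
# Legacy launcher (deprecated): --use_env makes it pass the local rank through
# the LOCAL_RANK environment variable instead of a --local_rank argument
python -m torch.distributed.launch --nproc_per_node 2 --use_env ./nlp_example.py

# torchrun (equivalently: python -m torch.distributed.run): LOCAL_RANK is always
# set in each worker's environment, so no extra flag is needed
torchrun --nproc_per_node 2 ./nlp_example.py
```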
examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index e0ebed846d4..a54757ba267 100644 --- a/examples/README.md +++ b/examples/README.md @@ -77,7 +77,7 @@ To run it in each of these various modes, use the following commands: * With PyTorch launcher only (`torch.distributed.launch` can be used in older versions of PyTorch) ```bash torchrun --nproc_per_node 2 \ - --nnodes 2 + --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ From d83a084a951f4543fe90caa683ce13e2a41c3214 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:49:17 +0200 Subject: [PATCH 03/20] Update examples/README.md Co-authored-by: Benjamin Bossan --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index a54757ba267..e17db84f615 100644 --- a/examples/README.md +++ b/examples/README.md @@ -84,7 +84,7 @@ To run it in each of these various modes, use the following commands: ./nlp_example.py # On the first server torchrun --nproc_per_node 2 \ - --nnodes 2 + --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ From 824b3d8d9a58507a582cede3357c7beef20425c5 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:49:43 +0200 Subject: [PATCH 04/20] Update examples/README.md Co-authored-by: Benjamin Bossan --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index e17db84f615..8c266ca3d0c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -168,7 +168,7 @@ To run it in each of these various modes, use the following commands: * With PyTorch launcher only (`torch.distributed.launch` can be used with older versions of PyTorch) ```bash torchrun --nproc_per_node 2 \ - --nnodes 2 + --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ From 9604aa1f584403b54da131a9f7b200dfa5cd1db4 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:50:08 +0200 Subject: [PATCH 05/20] Update examples/README.md Co-authored-by: Benjamin Bossan --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 8c266ca3d0c..ebee4ef612e 100644 --- a/examples/README.md +++ b/examples/README.md @@ -175,7 +175,7 @@ To run it in each of these various modes, use the following commands: ./cv_example.py --data_dir path_to_data # On the first server torchrun --nproc_per_node 2 \ - --nnodes 2 + --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ From b9665bb197ef2f0cbb6a0ae876a56eb70b32088c Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:11:08 +0200 Subject: [PATCH 06/20] Update README.md for torchrun instructions --- examples/README.md | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/examples/README.md b/examples/README.md index ebee4ef612e..2391c4df46d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -64,7 +64,7 @@ To run it in each of these various 
modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./nlp_example.py # This will run the script on your server ``` - * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) + * With traditional PyTorch launcher ```bash torchrun --nproc_per_node 2 ./nlp_example.py ``` @@ -74,21 +74,14 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only (`torch.distributed.launch` can be used in older versions of PyTorch) + * With PyTorch launcher only. Run this commnad on each node: ```bash torchrun --nproc_per_node 2 \ --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ - ./nlp_example.py # On the first server - - torchrun --nproc_per_node 2 \ - --nnodes 2 \ - --rdzv_id 2299 \ # A unique job id - --rdzv_backend c10d \ - --rdzv_endpoint master_node_ip_address:29500 \ - ./nlp_example.py # On the second server + ./nlp_example.py ``` - (multi) TPUs * With Accelerate config and launcher @@ -155,7 +148,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on your server ``` - * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) + * With traditional PyTorch launcher ```bash torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data ``` @@ -165,21 +158,14 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on each server ``` - * With PyTorch launcher only (`torch.distributed.launch` can be used with older versions of PyTorch) + * With PyTorch launcher only. 
Run this commnad on each node: ```bash torchrun --nproc_per_node 2 \ --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ --rdzv_endpoint master_node_ip_address:29500 \ - ./cv_example.py --data_dir path_to_data # On the first server - - torchrun --nproc_per_node 2 \ - --nnodes 2 \ - --rdzv_id 2299 \ # A unique job id - --rdzv_backend c10d \ - --rdzv_endpoint master_node_ip_address:29500 \ - ./cv_example.py --data_dir path_to_data # On the second server + ./cv_example.py --data_dir path_to_data ``` - (multi) TPUs * With Accelerate config and launcher From 64954daf054fa755ffa4c302292cc8078142b184 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 31 Oct 2023 20:00:42 +0100 Subject: [PATCH 07/20] Added SLURM scripts and updated README --- examples/README.md | 21 ++++++++++----- examples/Slurm/submit-multiGPU.sh | 25 +++++++++++++++++ examples/Slurm/submit-multinode.sh | 43 ++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 examples/Slurm/submit-multiGPU.sh create mode 100644 examples/Slurm/submit-multinode.sh diff --git a/examples/README.md b/examples/README.md index 2391c4df46d..5cdd4182437 100644 --- a/examples/README.md +++ b/examples/README.md @@ -64,7 +64,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./nlp_example.py # This will run the script on your server ``` - * With traditional PyTorch launcher + * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`) ```bash torchrun --nproc_per_node 2 ./nlp_example.py ``` @@ -74,9 +74,10 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only. Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this commnad on each node: ```bash - torchrun --nproc_per_node 2 \ + torchrun \ # python -m torch.distributed.run + --nproc_per_node 2 \ --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ @@ -148,7 +149,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on your server ``` - * With traditional PyTorch launcher + * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`) ```bash torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data ``` @@ -158,9 +159,10 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on each server ``` - * With PyTorch launcher only. Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). 
Run this commnad on each node: ```bash - torchrun --nproc_per_node 2 \ + torchrun \ # python -m torch.distributed.run + --nproc_per_node 2 \ --nnodes 2 \ --rdzv_id 2299 \ # A unique job id --rdzv_backend c10d \ @@ -198,6 +200,13 @@ with `pip install runhouse`, and you can refer to for hardware setup instructions, or this [Colab tutorial](https://colab.research.google.com/drive/1qVwYyLTCPYPSdz9ZX7BZl9Qm0A3j7RJe) for a more in-depth walkthrough. +## SLURM Scripts +In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) and [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. + +In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--nproc_per_node`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. + +In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--nproc_per_node`), the `--rdzv_id` which is a unique identifier, the [`--rdzv_backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--rdzv_endpoint` which will be the address and port of the master node. + ## Finer Examples While the first two scripts are extremely barebones when it comes to what you can do with accelerate, more advanced features are documented in two other locations. diff --git a/examples/Slurm/submit-multiGPU.sh b/examples/Slurm/submit-multiGPU.sh new file mode 100644 index 00000000000..1ee4afc1700 --- /dev/null +++ b/examples/Slurm/submit-multiGPU.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +#SBATCH --job-name=multigpu +#SBATCH -D . +#SBATCH --output=O-%x.%j +#SBATCH --error=E-%x.%j +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # number of MP tasks +#SBATCH --gres=gpu:4 # number of GPUs per node +#SBATCH --cpus-per-task=160 # number of cores per tasks +#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) + +###################### +### Set enviroment ### +###################### +source activateEnviroment.sh +###################### + +export SCRIPT=/accelerate/examples/complete_nlp_example.py +export SCRIPT_ARGS=" \ + --mixed_precision fp16 \ + --output_dir /accelerate/examples/output \ + " + +torchrun --nproc_per_node $SLURM_GPUS $SCRIPT $SCRIPT_ARGS \ No newline at end of file diff --git a/examples/Slurm/submit-multinode.sh b/examples/Slurm/submit-multinode.sh new file mode 100644 index 00000000000..951327f8827 --- /dev/null +++ b/examples/Slurm/submit-multinode.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +#SBATCH --job-name=multinode +#SBATCH -D . 
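+# -D . keeps the working directory at the submission directory; %x and %j in
+# the log file names below expand to the job name and the job id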
+#SBATCH --output=O-%x.%j +#SBATCH --error=E-%x.%j +#SBATCH --nodes=4 # number of nodes +#SBATCH --ntasks-per-node=1 # number of MP tasks +#SBATCH --gres=gpu:4 # number of GPUs per node +#SBATCH --cpus-per-task=160 # number of cores per tasks +#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) + +###################### +### Set enviroment ### +###################### +source activateEnviroment.sh +###################### + +###################### +#### Set network ##### +###################### +nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +nodes_array=($nodes) +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +###################### + +export LAUNCHER=" \ + torchrun \ + --nnodes $SLURM_NNODES \ + --nproc_per_node $SLURM_GPUS \ + --rdzv_id $RANDOM \ + --rdzv_backend c10d \ + --rdzv_endpoint $head_node_ip:29500 \ +" + +export SCRIPT=/accelerate/examples/complete_nlp_example.py +export SCRIPT_ARGS=" \ + --mixed_precision fp16 \ + --output_dir /accelerate/examples/output \ + " + +srun $LAUNCHER $SCRIPT $SCRIPT_ARGS \ No newline at end of file From 5a667b4b4d85e156bfbe49b1b9e1b439ca7e2295 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Thu, 2 Nov 2023 15:58:09 +0100 Subject: [PATCH 08/20] Update examples/Slurm/submit-multinode.sh Co-authored-by: Zach Mueller --- examples/Slurm/submit-multinode.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/Slurm/submit-multinode.sh b/examples/Slurm/submit-multinode.sh index 951327f8827..f7b3d433355 100644 --- a/examples/Slurm/submit-multinode.sh +++ b/examples/Slurm/submit-multinode.sh @@ -26,13 +26,11 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) ###################### export LAUNCHER=" \ - torchrun \ - --nnodes $SLURM_NNODES \ - --nproc_per_node $SLURM_GPUS \ - --rdzv_id $RANDOM \ - --rdzv_backend c10d \ - --rdzv_endpoint $head_node_ip:29500 \ -" + accelerate launch \ + --num_processes $((SLURM_NNODES * SLURM_GPUS)) \ + --num_machines $SLURM_NNODES \ + --rdzv_conf "id=$(RANDOM), backend=c10d", + --main_process_ip $head_node_ip \ export SCRIPT=/accelerate/examples/complete_nlp_example.py export SCRIPT_ARGS=" \ From c43be2571442c0a83020f07eb4c942968cc675b1 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Thu, 2 Nov 2023 15:58:18 +0100 Subject: [PATCH 09/20] Update examples/Slurm/submit-multiGPU.sh Co-authored-by: Zach Mueller --- examples/Slurm/submit-multiGPU.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Slurm/submit-multiGPU.sh b/examples/Slurm/submit-multiGPU.sh index 1ee4afc1700..f389072f0ce 100644 --- a/examples/Slurm/submit-multiGPU.sh +++ b/examples/Slurm/submit-multiGPU.sh @@ -22,4 +22,4 @@ export SCRIPT_ARGS=" \ --output_dir /accelerate/examples/output \ " -torchrun --nproc_per_node $SLURM_GPUS $SCRIPT $SCRIPT_ARGS \ No newline at end of file +accelerate launch --num_processes $SLURM_GPUS $SCRIPT $SCRIPT_ARGS \ No newline at end of file From bb959090211765ba92b51964ab21c925e927c6af Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Thu, 2 Nov 2023 15:58:39 +0100 Subject: [PATCH 10/20] Update examples/README.md Co-authored-by: Zach Mueller --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 
5cdd4182437..a369111a653 100644 --- a/examples/README.md +++ b/examples/README.md @@ -203,7 +203,7 @@ for hardware setup instructions, or this ## SLURM Scripts In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) and [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. -In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--nproc_per_node`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. +In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--nproc_per_node`), the `--rdzv_id` which is a unique identifier, the [`--rdzv_backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--rdzv_endpoint` which will be the address and port of the master node. From a5c26a83e8ee161ad1849b6c2d373bd37f316026 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Thu, 2 Nov 2023 16:04:54 +0100 Subject: [PATCH 11/20] Update examples/README.md Co-authored-by: Zach Mueller --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index a369111a653..fcb8dfd5266 100644 --- a/examples/README.md +++ b/examples/README.md @@ -205,7 +205,7 @@ In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) and [/Slurm/submit-mu In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. -In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--nproc_per_node`), the `--rdzv_id` which is a unique identifier, the [`--rdzv_backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--rdzv_endpoint` which will be the address and port of the master node. +In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--num_machines`), the `id` which is a unique identifier, the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `main_process_ip` which will be the address the master node. 
(Include `main_process_port` too if using another port other than 29500, the default) ## Finer Examples From b7becf9b4964ca2e5f7a0e88ee328070914b1bc9 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Thu, 2 Nov 2023 16:13:08 +0100 Subject: [PATCH 12/20] final details --- examples/README.md | 6 +++--- .../{Slurm/submit-multiGPU.sh => slurm/submit_multigpu.sh} | 0 .../submit-multinode.sh => slurm/submit_multinode.sh} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename examples/{Slurm/submit-multiGPU.sh => slurm/submit_multigpu.sh} (100%) rename examples/{Slurm/submit-multinode.sh => slurm/submit_multinode.sh} (100%) diff --git a/examples/README.md b/examples/README.md index fcb8dfd5266..cfb683fd88d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -201,11 +201,11 @@ for hardware setup instructions, or this [Colab tutorial](https://colab.research.google.com/drive/1qVwYyLTCPYPSdz9ZX7BZl9Qm0A3j7RJe) for a more in-depth walkthrough. ## SLURM Scripts -In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) and [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. +In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) and [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. -In [/Slurm/submit-multiGPU.sh](./Slurm/submit-multiGPU.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. +In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. -In [/Slurm/submit-multinode.sh](./Slurm/submit-multinode.sh) we must specify the number of nodes that will be part of the training (`--nnodes`), how many GPUs we will use per node (`--num_machines`), the `id` which is a unique identifier, the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `main_process_ip` which will be the address the master node. (Include `main_process_port` too if using another port other than 29500, the default) +In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the `id` which is a unique identifier, the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--main_process_ip` which will be the address the master node. 
(Include `--main_process_port` too if using another port other than 29500, the default) ## Finer Examples diff --git a/examples/Slurm/submit-multiGPU.sh b/examples/slurm/submit_multigpu.sh similarity index 100% rename from examples/Slurm/submit-multiGPU.sh rename to examples/slurm/submit_multigpu.sh diff --git a/examples/Slurm/submit-multinode.sh b/examples/slurm/submit_multinode.sh similarity index 100% rename from examples/Slurm/submit-multinode.sh rename to examples/slurm/submit_multinode.sh From c3bef5cdd11a8a120602b5b7ce158f7400881d7f Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 01:26:03 +0100 Subject: [PATCH 13/20] modified argument parser --- examples/complete_cv_example.py | 2 +- examples/complete_nlp_example.py | 2 +- examples/cv_example.py | 2 +- examples/nlp_example.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/complete_cv_example.py b/examples/complete_cv_example.py index 0fcb6f747c2..c5c84f22127 100644 --- a/examples/complete_cv_example.py +++ b/examples/complete_cv_example.py @@ -310,7 +310,7 @@ def main(): default="logs", help="Location on where to store experiment tracking logs` and relevent project information", ) - args = parser.parse_args() + args, _ = parser.parse_known_args() config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224} training_function(config, args) diff --git a/examples/complete_nlp_example.py b/examples/complete_nlp_example.py index 999e9082d9f..9f981ad5130 100644 --- a/examples/complete_nlp_example.py +++ b/examples/complete_nlp_example.py @@ -302,7 +302,7 @@ def main(): default="logs", help="Location on where to store experiment tracking logs` and relevent project information", ) - args = parser.parse_args() + args, _ = parser.parse_known_args() config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16} training_function(config, args) diff --git a/examples/cv_example.py b/examples/cv_example.py index 6d8ca126471..e0948b4e742 100644 --- a/examples/cv_example.py +++ b/examples/cv_example.py @@ -202,7 +202,7 @@ def main(): help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", ) parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.") - args = parser.parse_args() + args, _ = parser.parse_known_args() config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224} training_function(config, args) diff --git a/examples/nlp_example.py b/examples/nlp_example.py index 636672f288a..60ba1aff224 100644 --- a/examples/nlp_example.py +++ b/examples/nlp_example.py @@ -200,7 +200,7 @@ def main(): "and an Nvidia Ampere GPU.", ) parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.") - args = parser.parse_args() + args, _ = parser.parse_known_args() config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16} training_function(config, args) From 39e5398455cbe48d79fdf810fa326fc57999e9a7 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 01:27:12 +0100 Subject: [PATCH 14/20] modified slurm multigpu script --- examples/slurm/submit_multigpu.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/slurm/submit_multigpu.sh b/examples/slurm/submit_multigpu.sh index f389072f0ce..709d7490064 100644 --- a/examples/slurm/submit_multigpu.sh +++ b/examples/slurm/submit_multigpu.sh @@ -14,12 +14,14 @@ ### Set enviroment ### ###################### source activateEnviroment.sh +export 
GPUS_PER_NODE=4 ###################### export SCRIPT=/accelerate/examples/complete_nlp_example.py export SCRIPT_ARGS=" \ --mixed_precision fp16 \ --output_dir /accelerate/examples/output \ + --with_tracking \ " -accelerate launch --num_processes $SLURM_GPUS $SCRIPT $SCRIPT_ARGS \ No newline at end of file +accelerate launch --num_processes $GPUS_PER_NODE $SCRIPT $SCRIPT_ARGS \ No newline at end of file From 5f07a9cdc2bb7e1bb20990411099c5f11378e504 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 11:46:26 +0100 Subject: [PATCH 15/20] modified multinode slurm script --- examples/slurm/submit_multinode.sh | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/examples/slurm/submit_multinode.sh b/examples/slurm/submit_multinode.sh index f7b3d433355..2ac61fcaf79 100644 --- a/examples/slurm/submit_multinode.sh +++ b/examples/slurm/submit_multinode.sh @@ -14,28 +14,27 @@ ### Set enviroment ### ###################### source activateEnviroment.sh +export GPUS_PER_NODE=4 ###################### ###################### #### Set network ##### ###################### -nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) -nodes_array=($nodes) -head_node=${nodes_array[0]} -head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) ###################### -export LAUNCHER=" \ - accelerate launch \ - --num_processes $((SLURM_NNODES * SLURM_GPUS)) \ - --num_machines $SLURM_NNODES \ - --rdzv_conf "id=$(RANDOM), backend=c10d", +export LAUNCHER="accelerate launch \ + --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \ + --num_machines $SLURM_NNODES \ + --rdzv_backend c10d \ --main_process_ip $head_node_ip \ - -export SCRIPT=/accelerate/examples/complete_nlp_example.py + --main_process_port 29500 \ + " +export SCRIPT="/accelerate/examples/complete_nlp_example.py" export SCRIPT_ARGS=" \ --mixed_precision fp16 \ --output_dir /accelerate/examples/output \ " -srun $LAUNCHER $SCRIPT $SCRIPT_ARGS \ No newline at end of file +export CMD="$LAUNCHER $PYTHON_FILE $ARGS" +srun $CMD \ No newline at end of file From 6525931792e83f8850f5d67ffcd759cdef246824 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 11:53:09 +0100 Subject: [PATCH 16/20] Added accelerate multine issue --- examples/slurm/submit_multinode.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/slurm/submit_multinode.sh b/examples/slurm/submit_multinode.sh index 2ac61fcaf79..dfd3fa023c5 100644 --- a/examples/slurm/submit_multinode.sh +++ b/examples/slurm/submit_multinode.sh @@ -35,6 +35,7 @@ export SCRIPT_ARGS=" \ --mixed_precision fp16 \ --output_dir /accelerate/examples/output \ " - -export CMD="$LAUNCHER $PYTHON_FILE $ARGS" + +# This step is necessary because accelerate launch does not handle multiline arguments properly +export CMD="$LAUNCHER $PYTHON_FILE $ARGS" srun $CMD \ No newline at end of file From 55f3031d67b554b558172f1f5c0adbd7eae20da6 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:55:03 +0100 Subject: [PATCH 17/20] Update examples/README.md Co-authored-by: Zach Mueller --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index cfb683fd88d..cb2a77f6904 100644 --- a/examples/README.md +++ b/examples/README.md @@ -74,7 +74,7 @@ To run it in each of these various modes, use the following 
commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash torchrun \ # python -m torch.distributed.run --nproc_per_node 2 \ From 72069d5be52f2f70faab324999848b0af55fed9d Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 11:57:40 +0100 Subject: [PATCH 18/20] fixed readme commnad --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index cfb683fd88d..a2a75dc0c59 100644 --- a/examples/README.md +++ b/examples/README.md @@ -74,7 +74,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash torchrun \ # python -m torch.distributed.run --nproc_per_node 2 \ @@ -159,7 +159,7 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./cv_example.py --data_dir path_to_data # This will run the script on each server ``` - * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this commnad on each node: + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash torchrun \ # python -m torch.distributed.run --nproc_per_node 2 \ From 849139613a7c83aff719588616ad6275ef389b08 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 12:11:49 +0100 Subject: [PATCH 19/20] added --main_process_port specification to readme --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index a2a75dc0c59..a6e6617d088 100644 --- a/examples/README.md +++ b/examples/README.md @@ -205,7 +205,7 @@ In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) and [/slurm/submit_mu In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. -In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the `id` which is a unique identifier, the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend) and the `--main_process_ip` which will be the address the master node. 
(Include `--main_process_port` too if using another port other than 29500, the default) +In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip` which will be the address the master node and the `--main_process_port`. ## Finer Examples From d1170b19fe4babccd441d4509def3033179da830 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 15 Nov 2023 16:48:23 +0100 Subject: [PATCH 20/20] Revert "modified argument parser" This reverts commit c3bef5cdd11a8a120602b5b7ce158f7400881d7f. --- examples/complete_cv_example.py | 2 +- examples/complete_nlp_example.py | 2 +- examples/cv_example.py | 2 +- examples/nlp_example.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/complete_cv_example.py b/examples/complete_cv_example.py index c5c84f22127..0fcb6f747c2 100644 --- a/examples/complete_cv_example.py +++ b/examples/complete_cv_example.py @@ -310,7 +310,7 @@ def main(): default="logs", help="Location on where to store experiment tracking logs` and relevent project information", ) - args, _ = parser.parse_known_args() + args = parser.parse_args() config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224} training_function(config, args) diff --git a/examples/complete_nlp_example.py b/examples/complete_nlp_example.py index 9f981ad5130..999e9082d9f 100644 --- a/examples/complete_nlp_example.py +++ b/examples/complete_nlp_example.py @@ -302,7 +302,7 @@ def main(): default="logs", help="Location on where to store experiment tracking logs` and relevent project information", ) - args, _ = parser.parse_known_args() + args = parser.parse_args() config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16} training_function(config, args) diff --git a/examples/cv_example.py b/examples/cv_example.py index e0948b4e742..6d8ca126471 100644 --- a/examples/cv_example.py +++ b/examples/cv_example.py @@ -202,7 +202,7 @@ def main(): help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", ) parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.") - args, _ = parser.parse_known_args() + args = parser.parse_args() config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224} training_function(config, args) diff --git a/examples/nlp_example.py b/examples/nlp_example.py index 60ba1aff224..636672f288a 100644 --- a/examples/nlp_example.py +++ b/examples/nlp_example.py @@ -200,7 +200,7 @@ def main(): "and an Nvidia Ampere GPU.", ) parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.") - args, _ = parser.parse_known_args() + args = parser.parse_args() config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16} training_function(config, args)
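Series note: after PATCH 20/20, the launch line of examples/slurm/submit_multinode.sh still expands `$PYTHON_FILE` and `$ARGS`, two variables the script never defines — the variables exported earlier in the file are `SCRIPT` and `SCRIPT_ARGS` — so `srun` would start `accelerate launch` with no training script. A minimal sketch of the corrected tail of the script, reusing the `GPUS_PER_NODE` and `head_node_ip` values set earlier in the same file:

```bash
export LAUNCHER="accelerate launch \
    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
    --num_machines $SLURM_NNODES \
    --rdzv_backend c10d \
    --main_process_ip $head_node_ip \
    --main_process_port 29500 \
    "
export SCRIPT="/accelerate/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
    --mixed_precision fp16 \
    --output_dir /accelerate/examples/output \
    "

# Build a single command string (accelerate launch does not handle multiline
# arguments properly), expanding the variables that are actually defined above
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
srun $CMD
```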