From 81f1d789aac965b3de31009ef81bac3f791f0ca4 Mon Sep 17 00:00:00 2001
From: ffrancesco94 <ffiusco94@gmail.com>
Date: Fri, 8 Nov 2024 14:24:46 +0100
Subject: [PATCH 1/2] Update complete_nlp_example.py

Compute the metric with `evaluate` on the main process only, working around a bug in multi-node `evaluate`.
---
 examples/complete_nlp_example.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/complete_nlp_example.py b/examples/complete_nlp_example.py
index a692cb7b773..2de20fd01c9 100644
--- a/examples/complete_nlp_example.py
+++ b/examples/complete_nlp_example.py
@@ -246,9 +246,14 @@ def collate_fn(examples):
                 references=references,
             )
 
-        eval_metric = metric.compute()
-        # Use accelerator.print to print only on the main process.
-        accelerator.print(f"epoch {epoch}:", eval_metric)
+        if accelerator.is_main_process:
+            # Computing metrics in a distributed manner requires calling evaluate.load() with the
+            # num_process and process_id arguments. However, the metric.add_batch() step then fails
+            # due to a bug in datasets and evaluate (see https://github.com/huggingface/evaluate/issues/542
+            # and related issues), so the metric is computed on the main process only.
+            eval_metric = metric.compute()
+            # Use accelerator.print to print only on the main process.
+            accelerator.print(f"epoch {epoch}:", eval_metric)
         if args.with_tracking:
             accelerator.log(
                 {

From b71085a7c5ceb452d5e7ab9219e3f65d2922c95f Mon Sep 17 00:00:00 2001
From: ffrancesco94 <ffiusco94@gmail.com>
Date: Fri, 8 Nov 2024 14:32:37 +0100
Subject: [PATCH 2/2] Update submit_multinode.sh

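Enforce --multi_gpu when running on multiple nodes. Moreover, pass --machine_rank (taken from $SLURM_PROCID) and --role so that each rank gets addressed explicitly. $SLURM_NNODES has now been deprecated and is replaced by $SLURM_JOB_NUM_NODES. Reduce --cpus-per-task from 160 to 80 and export SRUN_CPUS_PER_TASK from $SLURM_CPUS_PER_TASK. Fix the typo in $CMD as well: it referenced $PYTHON_FILE and $ARGS instead of $SCRIPT and $SCRIPT_ARGS.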
---
 examples/slurm/submit_multinode.sh | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/examples/slurm/submit_multinode.sh b/examples/slurm/submit_multinode.sh
index 61d5ff2c6a2..2fd9fc7b2ae 100644
--- a/examples/slurm/submit_multinode.sh
+++ b/examples/slurm/submit_multinode.sh
@@ -7,13 +7,14 @@
 #SBATCH --nodes=4                   # number of nodes
 #SBATCH --ntasks-per-node=1         # number of MP tasks
 #SBATCH --gres=gpu:4                # number of GPUs per node
-#SBATCH --cpus-per-task=160         # number of cores per tasks
+#SBATCH --cpus-per-task=80          # number of cores per task
 #SBATCH --time=01:59:00             # maximum execution time (HH:MM:SS)
 
 ######################
 ### Set enviroment ###
 ######################
 source activateEnvironment.sh
+export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
 export GPUS_PER_NODE=4
 ######################
 
@@ -21,14 +22,18 @@ export GPUS_PER_NODE=4
 #### Set network #####
 ######################
 head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+NODE_RANK=$SLURM_PROCID
 ######################
 
 export LAUNCHER="accelerate launch \
-    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
-    --num_machines $SLURM_NNODES \
+    --num_processes $((SLURM_JOB_NUM_NODES * GPUS_PER_NODE)) \
+    --num_machines $SLURM_JOB_NUM_NODES \
     --rdzv_backend c10d \
     --main_process_ip $head_node_ip \
     --main_process_port 29500 \
+    --role $SLURMD_NODENAME: \
+    --machine_rank $NODE_RANK \
+    --multi_gpu \
     "
 export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
 export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
@@ -38,5 +43,5 @@ export SCRIPT_ARGS=" \
     "
     
 # This step is necessary because accelerate launch does not handle multiline arguments properly
-export CMD="$LAUNCHER $PYTHON_FILE $ARGS" 
-srun $CMD
\ No newline at end of file
+export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS" 
+srun $CMD