From 81f1d789aac965b3de31009ef81bac3f791f0ca4 Mon Sep 17 00:00:00 2001 From: ffrancesco94 Date: Fri, 8 Nov 2024 14:24:46 +0100 Subject: [PATCH 1/2] Update complete_nlp_example.py Compute metric with evaluate from main process only, avoiding bug in multinode evaluate. --- examples/complete_nlp_example.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/complete_nlp_example.py b/examples/complete_nlp_example.py index a692cb7b773..2de20fd01c9 100644 --- a/examples/complete_nlp_example.py +++ b/examples/complete_nlp_example.py @@ -246,9 +246,14 @@ def collate_fn(examples): references=references, ) - eval_metric = metric.compute() - # Use accelerator.print to print only on the main process. - accelerator.print(f"epoch {epoch}:", eval_metric) + if accelerator.is_main_process: + # Compute the metric on the main process only. Doing it in a distributed manner requires + # calling evaluate.load() with n_process and process_id, but metric.add_batch() then fails due to + # a bug in datasets/evaluate (see https://github.com/huggingface/evaluate/issues/542 and related issues). + # NOTE(review): eval_metric is only assigned in this branch - confirm later uses on other ranks. + eval_metric = metric.compute() + # Use accelerator.print to print only on the main process. + accelerator.print(f"epoch {epoch}:", eval_metric) if args.with_tracking: accelerator.log( { From b71085a7c5ceb452d5e7ab9219e3f65d2922c95f Mon Sep 17 00:00:00 2001 From: ffrancesco94 Date: Fri, 8 Nov 2024 14:32:37 +0100 Subject: [PATCH 2/2] Update submit_multinode.sh Enforce --multi_gpu on multiple nodes. Moreover, make sure that each rank gets correctly addressed based on the $SLURM_PROCID. $SLURM_NNODES has now been deprecated and replaced by $SLURM_JOB_NUM_NODES. Fixed typo in $CMD as well. 
---
NOTE(review): the batch script runs once, on the first allocated node, so an
unescaped $SLURM_PROCID / $SLURMD_NODENAME inside the double-quoted LAUNCHER
string is expanded there and every srun task receives the same
--machine_rank and --role. Both variables are now escaped and the command is
started with `srun bash -c "$CMD"`, so each task expands them in its own
environment; the intermediate NODE_RANK variable is no longer needed.
Hunk headers and the diffstat below were updated accordingly; the post-image
blob id on the index line was not recomputed. Inner whitespace of context
lines was reconstructed from a whitespace-collapsed copy of this patch; use
`git apply --ignore-whitespace` if column alignment differs.

 examples/slurm/submit_multinode.sh | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/examples/slurm/submit_multinode.sh b/examples/slurm/submit_multinode.sh
index 61d5ff2c6a2..2fd9fc7b2ae 100644
--- a/examples/slurm/submit_multinode.sh
+++ b/examples/slurm/submit_multinode.sh
@@ -7,13 +7,14 @@
 #SBATCH --nodes=4 # number of nodes
 #SBATCH --ntasks-per-node=1 # number of MP tasks
 #SBATCH --gres=gpu:4 # number of GPUs per node
-#SBATCH --cpus-per-task=160 # number of cores per tasks
+#SBATCH --cpus-per-task=80 # number of cores per tasks
 #SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS)
 
 ######################
 ### Set enviroment ###
 ######################
 source activateEnvironment.sh
+export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
 export GPUS_PER_NODE=4
 ######################
 
@@ -21,14 +22,17 @@ export GPUS_PER_NODE=4
 #### Set network #####
 ######################
 head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 ######################
 
 export LAUNCHER="accelerate launch \
-    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
-    --num_machines $SLURM_NNODES \
+    --num_processes $((SLURM_JOB_NUM_NODES * GPUS_PER_NODE)) \
+    --num_machines $SLURM_JOB_NUM_NODES \
     --rdzv_backend c10d \
     --main_process_ip $head_node_ip \
     --main_process_port 29500 \
+    --role \$SLURMD_NODENAME: \
+    --machine_rank \$SLURM_PROCID \
+    --multi_gpu \
     "
 export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
 export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
@@ -38,5 +42,5 @@ export SCRIPT_ARGS=" \
     "
 
 # This step is necessary because accelerate launch does not handle multiline arguments properly
-export CMD="$LAUNCHER $PYTHON_FILE $ARGS"
-srun $CMD
\ No newline at end of file
+export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
+srun bash -c "$CMD"