From f7254a250dc3b96625e83fb4b6c8902189a6ce44 Mon Sep 17 00:00:00 2001
From: Tomonori Shimomura <59209764+shimomut@users.noreply.github.com>
Date: Tue, 16 Apr 2024 13:38:31 -0700
Subject: [PATCH] Enabling auto-resume in FSDP app on HyperPod. (#271)

- Removed unnecessary back-slash characters in array declarations, as they
  are not compatible with auto-resume.
---
 .../10.FSDP/1.distributed-training.sbatch | 54 ++++++++++---------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/3.test_cases/10.FSDP/1.distributed-training.sbatch b/3.test_cases/10.FSDP/1.distributed-training.sbatch
index 0e20b147..e76d5129 100755
--- a/3.test_cases/10.FSDP/1.distributed-training.sbatch
+++ b/3.test_cases/10.FSDP/1.distributed-training.sbatch
@@ -39,11 +39,11 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
 ###########################
 
 declare -a TORCHRUN_ARGS=(
-    --nproc_per_node=$GPUS_PER_NODE \
-    --nnodes=$SLURM_JOB_NUM_NODES \
-    --rdzv_id=$SLURM_JOB_ID \
-    --rdzv_backend=c10d \
-    --rdzv_endpoint=$(hostname) \
+    --nproc_per_node=$GPUS_PER_NODE
+    --nnodes=$SLURM_JOB_NUM_NODES
+    --rdzv_id=$SLURM_JOB_ID
+    --rdzv_backend=c10d
+    --rdzv_endpoint=$(hostname)
 )
 
 export TORCHRUN=./pt_fsdp/bin/torchrun
@@ -54,25 +54,31 @@ export TRAIN_SCRIPT=./train.py
 ############################
 
 declare -a TRAINING_ARGS=(
-    --max_context_width=4096 \
-    --num_key_value_heads=32 \ # 7b: 32 13b: 40 70b: 8
-    --llama_intermediate_size=11008 \ # 7b: 11008 13b: 13824 70b: 28672
-    --hidden_width=4096 \ # 7b: 4096 13b: 5120 70b: 8192
-    --num_layers=32 \ # 7b: 32 13b: 40 70b: 80
-    --num_heads=32 \ # 7b: 32 13b: 40 70b: 64
-    --model_type=llama_v2 \
-    --tokenizer="hf-internal-testing/llama-tokenizer" \
-    --checkpoint_freq=5000 \
-    --validation_freq=500 \
-    --max_steps=5000 \
-    --checkpoint_dir=./checkpoints \
-    --dataset='c4' \
-    --dataset_config_name='en' \
-    --resume_from_checkpoint=./checkpoints \
-    --train_batch_size=1 \
-    --val_batch_size=1 \
-    --sharding_strategy="full" \ # https://pytorch.org/docs/stable/fsdp.html
+    --max_context_width=4096
+    --num_key_value_heads=32 # 7b: 32 13b: 40 70b: 8
+    --llama_intermediate_size=11008 # 7b: 11008 13b: 13824 70b: 28672
+    --hidden_width=4096 # 7b: 4096 13b: 5120 70b: 8192
+    --num_layers=32 # 7b: 32 13b: 40 70b: 80
+    --num_heads=32 # 7b: 32 13b: 40 70b: 64
+    --model_type=llama_v2
+    --tokenizer="hf-internal-testing/llama-tokenizer"
+    --checkpoint_freq=5000
+    --validation_freq=500
+    --max_steps=5000
+    --checkpoint_dir=./checkpoints
+    --dataset='c4'
+    --dataset_config_name='en'
+    --resume_from_checkpoint=./checkpoints
+    --train_batch_size=1
+    --val_batch_size=1
+    --sharding_strategy="full" # https://pytorch.org/docs/stable/fsdp.html
     --offload_activations=1
 )
 
-srun -l ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
+AUTO_RESUME=""
+if [ -d "/opt/sagemaker_cluster" ]; then
+    echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
+    AUTO_RESUME="--auto-resume=1"
+fi
+
+srun ${AUTO_RESUME} -l ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
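
Some context on the two changes above. Bash array declarations continue across
lines until the closing parenthesis, so the trailing backslashes were never
needed; combined with the inline comments, a backslash can also leave a stray
element in the array. A minimal standalone sketch (not part of the patch; the
BAD/GOOD names are only for illustration):

    #!/bin/bash
    # With a backslash before an inline comment, the escaped space becomes
    # its own array element.
    declare -a BAD=(
        --num_layers=32 \ # 7b: 32
    )
    echo "${#BAD[@]}"    # prints 2: "--num_layers=32" and a lone " "

    # Without the backslash, the comment is simply ignored.
    declare -a GOOD=(
        --num_layers=32  # 7b: 32
    )
    echo "${#GOOD[@]}"   # prints 1

With the new AUTO_RESUME logic, on a HyperPod cluster (where
/opt/sagemaker_cluster exists) the job step roughly expands to

    srun --auto-resume=1 -l ./pt_fsdp/bin/torchrun "${TORCHRUN_ARGS[@]}" ./train.py "${TRAINING_ARGS[@]}"

while on any other Slurm cluster AUTO_RESUME stays empty, the unquoted
${AUTO_RESUME} expands to nothing, and the original srun invocation is
unchanged.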