
Commit 0042c47
Enable auto-resume for the Megatron-LM app on HyperPod and fix FP32 assertion failure (#273)

* Pin Megatron-LM to a specific version (core_v0.4.0) to avoid the FP32 assertion failure

* Enable auto-resume on HyperPod

* Ignore training input files under the gpt2/ directory.
shimomut authored Apr 18, 2024
1 parent f7254a2 commit 0042c47
Showing 3 changed files with 23 additions and 16 deletions.
1 change: 1 addition & 0 deletions 3.test_cases/1.megatron-lm/.gitignore
@@ -0,0 +1 @@
+gpt2
2 changes: 1 addition & 1 deletion, Dockerfile under 3.test_cases/1.megatron-lm/
@@ -99,7 +99,7 @@ RUN pip install transformers==4.21.0 sentencepiece
#####################
# Install megatron-lm
#####################
-RUN cd /workspace && git clone https://github.com/NVIDIA/Megatron-LM.git \
+RUN cd /workspace && git clone --depth 1 --branch core_v0.4.0 https://github.com/NVIDIA/Megatron-LM.git \
&& cd Megatron-LM \
&& python3 -m pip install nltk \
&& python -m pip install .
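A quick way to sanity-check the pin after building the image is to ask git inside the container which ref was checked out. This is an illustrative sketch: the image tag is an assumption, and it assumes git is still present in the final image; only the clone path /workspace/Megatron-LM comes from the diff above.

    # hypothetical image tag; substitute whatever the build was tagged with
    docker run --rm megatron-training:latest \
        git -C /workspace/Megatron-LM describe --tags --always
    # expect a ref at or near core_v0.4.0 rather than an arbitrary main-branch commit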
36 changes: 21 additions & 15 deletions 3.test_cases/1.megatron-lm/2.distributed-training.sbatch
@@ -59,29 +59,35 @@ declare -a ARGS=(

declare -a TORCHRUN_ARGS=(
# change this to match the number of gpus per node:
-    --nproc_per_node=8 \
-    --nnodes=$SLURM_JOB_NUM_NODES \
-    --rdzv_id=$SLURM_JOB_ID \
-    --rdzv_backend=c10d \
-    --rdzv_endpoint=$(hostname) \
+    --nproc_per_node=8
+    --nnodes=$SLURM_JOB_NUM_NODES
+    --rdzv_id=$SLURM_JOB_ID
+    --rdzv_backend=c10d
+    --rdzv_endpoint=$(hostname)
)

declare -a MEGATRON_ARGS=(
-    --num-layers $NUM_LAYERS \
-    --hidden-size $HIDDEN_SIZE \
-    --num-attention-heads $NUM_ATTENTION_HEADS \
-    --seq-length $SEQ_LENGTH \
-    --max-position-embeddings $MAX_POSITION_EMBEDDINGS \
-    --micro-batch-size $MICRO_BATCH_SIZE \
-    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --num-layers $NUM_LAYERS
+    --hidden-size $HIDDEN_SIZE
+    --num-attention-heads $NUM_ATTENTION_HEADS
+    --seq-length $SEQ_LENGTH
+    --max-position-embeddings $MAX_POSITION_EMBEDDINGS
+    --micro-batch-size $MICRO_BATCH_SIZE
+    --global-batch-size $GLOBAL_BATCH_SIZE
)

declare -a MEGATRON_PARALLELISM=(
-    --tensor-model-parallel-size $TENSOR_PARALLEL \
-    --pipeline-model-parallel-size $PIPELINE_PARALLEL \
+    --tensor-model-parallel-size $TENSOR_PARALLEL
+    --pipeline-model-parallel-size $PIPELINE_PARALLEL
)

-srun -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \
+AUTO_RESUME=""
+if [ -d "/opt/sagemaker_cluster" ]; then
+    echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
+    AUTO_RESUME="--auto-resume=1"
+fi
+
+srun ${AUTO_RESUME} -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \
"${MEGATRON_PARALLELISM[@]}" \
"${MEGATRON_ARGS[@]}" \
--train-samples 146484375 \
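On HyperPod compute nodes the /opt/sagemaker_cluster directory exists, so submitting the job as before now passes --auto-resume=1 to srun automatically. A minimal way to confirm the new branch was taken is sketched below; the sbatch path comes from this diff, while the log file name is an assumption and depends on the script's #SBATCH output settings.

    sbatch 3.test_cases/1.megatron-lm/2.distributed-training.sbatch
    # after the job starts, the echo added in this commit should appear in the job log
    grep "Detected Hyperpod cluster" slurm-*.out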
