From 0042c47ed15fab63c6ccb8c4737ab4b143befea6 Mon Sep 17 00:00:00 2001
From: Tomonori Shimomura <59209764+shimomut@users.noreply.github.com>
Date: Thu, 18 Apr 2024 10:21:37 -0700
Subject: [PATCH] Enable auto-resume for the Megatron-LM app on HyperPod and
 fix FP32 assertion failure (#273)

* Pin Megatron-LM to a specific version (tag core_v0.4.0) to avoid the FP32 assertion failure

* Enable auto-resume on HyperPod: pass --auto-resume=1 to srun when /opt/sagemaker_cluster exists

* Ignore training input files under the gpt2/ directory.
---
 3.test_cases/1.megatron-lm/.gitignore         |  1 +
 .../0.distributed-training.Dockerfile         |  2 +-
 .../2.distributed-training.sbatch             | 36 +++++++++++--------
 3 files changed, 23 insertions(+), 16 deletions(-)
 create mode 100644 3.test_cases/1.megatron-lm/.gitignore

diff --git a/3.test_cases/1.megatron-lm/.gitignore b/3.test_cases/1.megatron-lm/.gitignore
new file mode 100644
index 00000000..cd46aad1
--- /dev/null
+++ b/3.test_cases/1.megatron-lm/.gitignore
@@ -0,0 +1 @@
+gpt2
\ No newline at end of file
diff --git a/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile b/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile
index 601f00a1..5d336a16 100644
--- a/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile
+++ b/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile
@@ -99,7 +99,7 @@ RUN pip install transformers==4.21.0 sentencepiece
 #####################
 # Install megatron-lm
 #####################
-RUN cd /workspace && git clone https://github.com/NVIDIA/Megatron-LM.git \
+RUN cd /workspace && git clone --depth 1 --branch core_v0.4.0 https://github.com/NVIDIA/Megatron-LM.git \
 	&& cd Megatron-LM \
 	&& python3 -m pip install nltk  \
 	&& python -m pip install .
diff --git a/3.test_cases/1.megatron-lm/2.distributed-training.sbatch b/3.test_cases/1.megatron-lm/2.distributed-training.sbatch
index b34ba37a..ed903fef 100644
--- a/3.test_cases/1.megatron-lm/2.distributed-training.sbatch
+++ b/3.test_cases/1.megatron-lm/2.distributed-training.sbatch
@@ -59,29 +59,35 @@ declare -a ARGS=(
 
 declare -a TORCHRUN_ARGS=(
     # change this to match the number of gpus per node:
-    --nproc_per_node=8 \
-    --nnodes=$SLURM_JOB_NUM_NODES \
-    --rdzv_id=$SLURM_JOB_ID \
-    --rdzv_backend=c10d \
-    --rdzv_endpoint=$(hostname) \
+    --nproc_per_node=8
+    --nnodes=$SLURM_JOB_NUM_NODES
+    --rdzv_id=$SLURM_JOB_ID
+    --rdzv_backend=c10d
+    --rdzv_endpoint=$(hostname)
 )
 
 declare -a MEGATRON_ARGS=(
-        --num-layers $NUM_LAYERS \
-        --hidden-size $HIDDEN_SIZE \
-        --num-attention-heads $NUM_ATTENTION_HEADS \
-        --seq-length $SEQ_LENGTH \
-        --max-position-embeddings $MAX_POSITION_EMBEDDINGS \
-        --micro-batch-size $MICRO_BATCH_SIZE \
-        --global-batch-size $GLOBAL_BATCH_SIZE \
+        --num-layers $NUM_LAYERS
+        --hidden-size $HIDDEN_SIZE
+        --num-attention-heads $NUM_ATTENTION_HEADS
+        --seq-length $SEQ_LENGTH
+        --max-position-embeddings $MAX_POSITION_EMBEDDINGS
+        --micro-batch-size $MICRO_BATCH_SIZE
+        --global-batch-size $GLOBAL_BATCH_SIZE
 )
 
 declare -a MEGATRON_PARALLELISM=(
-        --tensor-model-parallel-size $TENSOR_PARALLEL \
-        --pipeline-model-parallel-size $PIPELINE_PARALLEL \
+        --tensor-model-parallel-size $TENSOR_PARALLEL
+        --pipeline-model-parallel-size $PIPELINE_PARALLEL
 )
 
-srun -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \
+AUTO_RESUME=""
+if [ -d "/opt/sagemaker_cluster" ]; then
+    echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
+    AUTO_RESUME="--auto-resume=1"
+fi
+
+srun ${AUTO_RESUME} -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \
         "${MEGATRON_PARALLELISM[@]}" \
         "${MEGATRON_ARGS[@]}" \
         --train-samples 146484375 \