From 0042c47ed15fab63c6ccb8c4737ab4b143befea6 Mon Sep 17 00:00:00 2001 From: Tomonori Shimomura <59209764+shimomut@users.noreply.github.com> Date: Thu, 18 Apr 2024 10:21:37 -0700 Subject: [PATCH] Enable auto-resume for the Megatron-LM app on HyperPod and fix FP32 assertion failure (#273) * Use specific version of Megatron-LM, to avoid FP32 assertion failure * Enable auto-resume on HyperPod * Ignore training input files under gpt2/ directory. --- 3.test_cases/1.megatron-lm/.gitignore | 1 + .../0.distributed-training.Dockerfile | 2 +- .../2.distributed-training.sbatch | 36 +++++++++++-------- 3 files changed, 23 insertions(+), 16 deletions(-) create mode 100644 3.test_cases/1.megatron-lm/.gitignore diff --git a/3.test_cases/1.megatron-lm/.gitignore b/3.test_cases/1.megatron-lm/.gitignore new file mode 100644 index 00000000..cd46aad1 --- /dev/null +++ b/3.test_cases/1.megatron-lm/.gitignore @@ -0,0 +1 @@ +gpt2 \ No newline at end of file diff --git a/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile b/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile index 601f00a1..5d336a16 100644 --- a/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile +++ b/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile @@ -99,7 +99,7 @@ RUN pip install transformers==4.21.0 sentencepiece ##################### # Install megatron-lm ##################### -RUN cd /workspace && git clone https://github.com/NVIDIA/Megatron-LM.git \ +RUN cd /workspace && git clone --depth 1 --branch core_v0.4.0 https://github.com/NVIDIA/Megatron-LM.git \ && cd Megatron-LM \ && python3 -m pip install nltk \ && python -m pip install . diff --git a/3.test_cases/1.megatron-lm/2.distributed-training.sbatch b/3.test_cases/1.megatron-lm/2.distributed-training.sbatch index b34ba37a..ed903fef 100644 --- a/3.test_cases/1.megatron-lm/2.distributed-training.sbatch +++ b/3.test_cases/1.megatron-lm/2.distributed-training.sbatch @@ -59,29 +59,35 @@ declare -a ARGS=( declare -a TORCHRUN_ARGS=( # change this to match the number of gpus per node: - --nproc_per_node=8 \ - --nnodes=$SLURM_JOB_NUM_NODES \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_backend=c10d \ - --rdzv_endpoint=$(hostname) \ + --nproc_per_node=8 + --nnodes=$SLURM_JOB_NUM_NODES + --rdzv_id=$SLURM_JOB_ID + --rdzv_backend=c10d + --rdzv_endpoint=$(hostname) ) declare -a MEGATRON_ARGS=( - --num-layers $NUM_LAYERS \ - --hidden-size $HIDDEN_SIZE \ - --num-attention-heads $NUM_ATTENTION_HEADS \ - --seq-length $SEQ_LENGTH \ - --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ - --micro-batch-size $MICRO_BATCH_SIZE \ - --global-batch-size $GLOBAL_BATCH_SIZE \ + --num-layers $NUM_LAYERS + --hidden-size $HIDDEN_SIZE + --num-attention-heads $NUM_ATTENTION_HEADS + --seq-length $SEQ_LENGTH + --max-position-embeddings $MAX_POSITION_EMBEDDINGS + --micro-batch-size $MICRO_BATCH_SIZE + --global-batch-size $GLOBAL_BATCH_SIZE ) declare -a MEGATRON_PARALLELISM=( - --tensor-model-parallel-size $TENSOR_PARALLEL \ - --pipeline-model-parallel-size $PIPELINE_PARALLEL \ + --tensor-model-parallel-size $TENSOR_PARALLEL + --pipeline-model-parallel-size $PIPELINE_PARALLEL ) -srun -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \ +AUTO_RESUME="" +if [ -d "/opt/sagemaker_cluster" ]; then + echo "Detected Hyperpod cluster.. enabling --auto-resume=1" + AUTO_RESUME="--auto-resume=1" +fi + +srun ${AUTO_RESUME} -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \ "${MEGATRON_PARALLELISM[@]}" \ "${MEGATRON_ARGS[@]}" \ --train-samples 146484375 \