
Commit 0042c47
Enable auto-resume for the Megatron-LM app on HyperPod and fix FP32 assertion failure (#273)

* Pin Megatron-LM to a specific version (core_v0.4.0) to avoid the FP32 assertion failure

* Enable auto-resume on HyperPod

* Ignore training input files under the gpt2/ directory.
shimomut authored Apr 18, 2024
1 parent f7254a2 commit 0042c47
Showing 3 changed files with 23 additions and 16 deletions.
1 change: 1 addition & 0 deletions 3.test_cases/1.megatron-lm/.gitignore
@@ -0,0 +1 @@
+gpt2
2 changes: 1 addition & 1 deletion, Dockerfile under 3.test_cases/1.megatron-lm/
@@ -99,7 +99,7 @@ RUN pip install transformers==4.21.0 sentencepiece
#####################
# Install megatron-lm
#####################
-RUN cd /workspace && git clone https://github.com/NVIDIA/Megatron-LM.git \
+RUN cd /workspace && git clone --depth 1 --branch core_v0.4.0 https://github.com/NVIDIA/Megatron-LM.git \
&& cd Megatron-LM \
&& python3 -m pip install nltk \
&& python -m pip install .
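A quick way to sanity-check the pin after building the image is to ask git inside the container which ref was checked out. This is an illustrative sketch: the image tag is an assumption, and it assumes git is still present in the final image; only the clone path /workspace/Megatron-LM comes from the diff above.

    # hypothetical image tag; substitute whatever the build was tagged with
    docker run --rm megatron-training:latest \
        git -C /workspace/Megatron-LM describe --tags --always
    # expect a ref at or near core_v0.4.0 rather than an arbitrary main-branch commit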
36 changes: 21 additions & 15 deletions 3.test_cases/1.megatron-lm/2.distributed-training.sbatch
@@ -59,29 +59,35 @@ declare -a ARGS=(

declare -a TORCHRUN_ARGS=(
# change this to match the number of gpus per node:
-    --nproc_per_node=8 \
-    --nnodes=$SLURM_JOB_NUM_NODES \
-    --rdzv_id=$SLURM_JOB_ID \
-    --rdzv_backend=c10d \
-    --rdzv_endpoint=$(hostname) \
+    --nproc_per_node=8
+    --nnodes=$SLURM_JOB_NUM_NODES
+    --rdzv_id=$SLURM_JOB_ID
+    --rdzv_backend=c10d
+    --rdzv_endpoint=$(hostname)
)

declare -a MEGATRON_ARGS=(
-    --num-layers $NUM_LAYERS \
-    --hidden-size $HIDDEN_SIZE \
-    --num-attention-heads $NUM_ATTENTION_HEADS \
-    --seq-length $SEQ_LENGTH \
-    --max-position-embeddings $MAX_POSITION_EMBEDDINGS \
-    --micro-batch-size $MICRO_BATCH_SIZE \
-    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --num-layers $NUM_LAYERS
+    --hidden-size $HIDDEN_SIZE
+    --num-attention-heads $NUM_ATTENTION_HEADS
+    --seq-length $SEQ_LENGTH
+    --max-position-embeddings $MAX_POSITION_EMBEDDINGS
+    --micro-batch-size $MICRO_BATCH_SIZE
+    --global-batch-size $GLOBAL_BATCH_SIZE
)

declare -a MEGATRON_PARALLELISM=(
-    --tensor-model-parallel-size $TENSOR_PARALLEL \
-    --pipeline-model-parallel-size $PIPELINE_PARALLEL \
+    --tensor-model-parallel-size $TENSOR_PARALLEL
+    --pipeline-model-parallel-size $PIPELINE_PARALLEL
)

-srun -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \
+AUTO_RESUME=""
+if [ -d "/opt/sagemaker_cluster" ]; then
+    echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
+    AUTO_RESUME="--auto-resume=1"
+fi
+
+srun ${AUTO_RESUME} -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \
"${MEGATRON_PARALLELISM[@]}" \
"${MEGATRON_ARGS[@]}" \
--train-samples 146484375 \
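On HyperPod compute nodes the /opt/sagemaker_cluster directory exists, so submitting the job as before now passes --auto-resume=1 to srun automatically. A minimal way to confirm the new branch was taken is sketched below; the sbatch path comes from this diff, while the log file name is an assumption and depends on the script's #SBATCH output settings.

    sbatch 3.test_cases/1.megatron-lm/2.distributed-training.sbatch
    # after the job starts, the echo added in this commit should appear in the job log
    grep "Detected Hyperpod cluster" slurm-*.out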
