diff --git a/3.test_cases/1.megatron-lm/.gitignore b/3.test_cases/1.megatron-lm/.gitignore
new file mode 100644
index 00000000..cd46aad1
--- /dev/null
+++ b/3.test_cases/1.megatron-lm/.gitignore
@@ -0,0 +1 @@
+gpt2
\ No newline at end of file
diff --git a/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile b/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile
index 601f00a1..5d336a16 100644
--- a/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile
+++ b/3.test_cases/1.megatron-lm/0.distributed-training.Dockerfile
@@ -99,7 +99,7 @@ RUN pip install transformers==4.21.0 sentencepiece
 #####################
 # Install megatron-lm
 #####################
-RUN cd /workspace && git clone https://github.com/NVIDIA/Megatron-LM.git \
+RUN cd /workspace && git clone --depth 1 --branch core_v0.4.0 https://github.com/NVIDIA/Megatron-LM.git \
     && cd Megatron-LM \
     && python3 -m pip install nltk \
     && python -m pip install .
diff --git a/3.test_cases/1.megatron-lm/2.distributed-training.sbatch b/3.test_cases/1.megatron-lm/2.distributed-training.sbatch
index b34ba37a..ed903fef 100644
--- a/3.test_cases/1.megatron-lm/2.distributed-training.sbatch
+++ b/3.test_cases/1.megatron-lm/2.distributed-training.sbatch
@@ -59,29 +59,35 @@ declare -a ARGS=(
 
 declare -a TORCHRUN_ARGS=(
     # change this to match the number of gpus per node:
-    --nproc_per_node=8 \
-    --nnodes=$SLURM_JOB_NUM_NODES \
-    --rdzv_id=$SLURM_JOB_ID \
-    --rdzv_backend=c10d \
-    --rdzv_endpoint=$(hostname) \
+    --nproc_per_node=8
+    --nnodes=$SLURM_JOB_NUM_NODES
+    --rdzv_id=$SLURM_JOB_ID
+    --rdzv_backend=c10d
+    --rdzv_endpoint=$(hostname)
 )
 
 declare -a MEGATRON_ARGS=(
-    --num-layers $NUM_LAYERS \
-    --hidden-size $HIDDEN_SIZE \
-    --num-attention-heads $NUM_ATTENTION_HEADS \
-    --seq-length $SEQ_LENGTH \
-    --max-position-embeddings $MAX_POSITION_EMBEDDINGS \
-    --micro-batch-size $MICRO_BATCH_SIZE \
-    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --num-layers $NUM_LAYERS
+    --hidden-size $HIDDEN_SIZE
+    --num-attention-heads $NUM_ATTENTION_HEADS
+    --seq-length $SEQ_LENGTH
+    --max-position-embeddings $MAX_POSITION_EMBEDDINGS
+    --micro-batch-size $MICRO_BATCH_SIZE
+    --global-batch-size $GLOBAL_BATCH_SIZE
 )
 
 declare -a MEGATRON_PARALLELISM=(
-    --tensor-model-parallel-size $TENSOR_PARALLEL \
-    --pipeline-model-parallel-size $PIPELINE_PARALLEL \
+    --tensor-model-parallel-size $TENSOR_PARALLEL
+    --pipeline-model-parallel-size $PIPELINE_PARALLEL
 )
 
-srun -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \
+AUTO_RESUME=""
+if [ -d "/opt/sagemaker_cluster" ]; then
+    echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
+    AUTO_RESUME="--auto-resume=1"
+fi
+
+srun ${AUTO_RESUME} -l "${ARGS[@]}" python -m torch.distributed.run "${TORCHRUN_ARGS[@]}" /workspace/Megatron-LM/pretrain_gpt.py \
     "${MEGATRON_PARALLELISM[@]}" \
     "${MEGATRON_ARGS[@]}" \
     --train-samples 146484375 \