From 396de2a28b27396ad7939223f7d9d4525bb14579 Mon Sep 17 00:00:00 2001 From: Trevor Gale Date: Mon, 11 Dec 2023 13:59:46 -0500 Subject: [PATCH] Update Megatron-LM scripts and integration for latest Docker container. (#55) * Update megatron-lm and scripts for new container. --- exp/dmoe/dmoe_125m_8gpu.sh | 2 +- exp/dmoe/dmoe_356m_8gpu.sh | 2 +- exp/dmoe/dmoe_46m_8gpu.sh | 2 +- exp/dmoe/dmoe_760m_8gpu.sh | 2 +- exp/gpt2/gpt2_125m_1gpu.sh | 2 +- exp/gpt2/gpt2_125m_8gpu.sh | 2 +- exp/gpt2/gpt2_1315m_1gpu.sh | 2 +- exp/gpt2/gpt2_1315m_8gpu.sh | 2 +- exp/gpt2/gpt2_356m_1gpu.sh | 2 +- exp/gpt2/gpt2_356m_8gpu.sh | 2 +- exp/gpt2/gpt2_46m_1gpu.sh | 2 +- exp/gpt2/gpt2_46m_8gpu.sh | 2 +- exp/gpt2/gpt2_760m_1gpu.sh | 2 +- exp/gpt2/gpt2_760m_8gpu.sh | 2 +- exp/moe/moe_125m_8gpu.sh | 2 +- exp/moe/moe_356m_8gpu.sh | 2 +- exp/moe/moe_46m_8gpu.sh | 2 +- third_party/Megatron-LM | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/exp/dmoe/dmoe_125m_8gpu.sh b/exp/dmoe/dmoe_125m_8gpu.sh index 3ccb5ecd..740e8a2e 100644 --- a/exp/dmoe/dmoe_125m_8gpu.sh +++ b/exp/dmoe/dmoe_125m_8gpu.sh @@ -156,7 +156,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MOE_ARGUMENTS} \ ${MODEL_ARGUMENTS} \ diff --git a/exp/dmoe/dmoe_356m_8gpu.sh b/exp/dmoe/dmoe_356m_8gpu.sh index cbf33b18..11c1ed60 100644 --- a/exp/dmoe/dmoe_356m_8gpu.sh +++ b/exp/dmoe/dmoe_356m_8gpu.sh @@ -157,7 +157,7 @@ EVALUATION_ARGUMENTS="\ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MOE_ARGUMENTS} \ ${MODEL_ARGUMENTS} \ diff --git a/exp/dmoe/dmoe_46m_8gpu.sh b/exp/dmoe/dmoe_46m_8gpu.sh index c6478aa8..c4f7eb65 100644 --- a/exp/dmoe/dmoe_46m_8gpu.sh +++ b/exp/dmoe/dmoe_46m_8gpu.sh @@ -156,7 +156,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MOE_ARGUMENTS} \ ${MODEL_ARGUMENTS} \ diff --git a/exp/dmoe/dmoe_760m_8gpu.sh b/exp/dmoe/dmoe_760m_8gpu.sh index 89a66f0a..e143bf54 100644 --- a/exp/dmoe/dmoe_760m_8gpu.sh +++ b/exp/dmoe/dmoe_760m_8gpu.sh @@ -156,7 +156,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MOE_ARGUMENTS} \ ${MODEL_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_125m_1gpu.sh b/exp/gpt2/gpt2_125m_1gpu.sh index 973aa745..217b47d2 100644 --- a/exp/gpt2/gpt2_125m_1gpu.sh +++ b/exp/gpt2/gpt2_125m_1gpu.sh @@ -126,7 +126,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_125m_8gpu.sh b/exp/gpt2/gpt2_125m_8gpu.sh index 58532f23..c06722c0 100644 --- a/exp/gpt2/gpt2_125m_8gpu.sh +++ b/exp/gpt2/gpt2_125m_8gpu.sh @@ -130,7 +130,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_1315m_1gpu.sh b/exp/gpt2/gpt2_1315m_1gpu.sh index 32986873..f89bbf0c 100644 --- a/exp/gpt2/gpt2_1315m_1gpu.sh +++ b/exp/gpt2/gpt2_1315m_1gpu.sh @@ -125,7 +125,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_1315m_8gpu.sh b/exp/gpt2/gpt2_1315m_8gpu.sh index 2c51643a..94bd301d 100644 --- a/exp/gpt2/gpt2_1315m_8gpu.sh +++ b/exp/gpt2/gpt2_1315m_8gpu.sh @@ -129,7 +129,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_356m_1gpu.sh b/exp/gpt2/gpt2_356m_1gpu.sh index 2462d435..4e03e8f8 100644 --- a/exp/gpt2/gpt2_356m_1gpu.sh +++ b/exp/gpt2/gpt2_356m_1gpu.sh @@ -126,7 +126,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_356m_8gpu.sh b/exp/gpt2/gpt2_356m_8gpu.sh index ff452eeb..231ad5ea 100644 --- a/exp/gpt2/gpt2_356m_8gpu.sh +++ b/exp/gpt2/gpt2_356m_8gpu.sh @@ -130,7 +130,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_46m_1gpu.sh b/exp/gpt2/gpt2_46m_1gpu.sh index 09bbbe25..ca3602a7 100644 --- a/exp/gpt2/gpt2_46m_1gpu.sh +++ b/exp/gpt2/gpt2_46m_1gpu.sh @@ -126,7 +126,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_46m_8gpu.sh b/exp/gpt2/gpt2_46m_8gpu.sh index 628b0ec6..6f8a861e 100644 --- a/exp/gpt2/gpt2_46m_8gpu.sh +++ b/exp/gpt2/gpt2_46m_8gpu.sh @@ -130,7 +130,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_760m_1gpu.sh b/exp/gpt2/gpt2_760m_1gpu.sh index 6893141e..1544f5a0 100644 --- a/exp/gpt2/gpt2_760m_1gpu.sh +++ b/exp/gpt2/gpt2_760m_1gpu.sh @@ -126,7 +126,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/gpt2/gpt2_760m_8gpu.sh b/exp/gpt2/gpt2_760m_8gpu.sh index 625b87ef..f445b39d 100644 --- a/exp/gpt2/gpt2_760m_8gpu.sh +++ b/exp/gpt2/gpt2_760m_8gpu.sh @@ -130,7 +130,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MODEL_ARGUMENTS} \ ${TRAINING_ARGUMENTS} \ diff --git a/exp/moe/moe_125m_8gpu.sh b/exp/moe/moe_125m_8gpu.sh index 5b81ef79..ff7c5e2e 100644 --- a/exp/moe/moe_125m_8gpu.sh +++ b/exp/moe/moe_125m_8gpu.sh @@ -160,7 +160,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MOE_ARGUMENTS} \ ${MODEL_ARGUMENTS} \ diff --git a/exp/moe/moe_356m_8gpu.sh b/exp/moe/moe_356m_8gpu.sh index 21d304c8..3ec4e78b 100644 --- a/exp/moe/moe_356m_8gpu.sh +++ b/exp/moe/moe_356m_8gpu.sh @@ -160,7 +160,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MOE_ARGUMENTS} \ ${MODEL_ARGUMENTS} \ diff --git a/exp/moe/moe_46m_8gpu.sh b/exp/moe/moe_46m_8gpu.sh index 4fc86e90..69637724 100644 --- a/exp/moe/moe_46m_8gpu.sh +++ b/exp/moe/moe_46m_8gpu.sh @@ -160,7 +160,7 @@ EVALUATION_ARGUMENTS="\ --log-interval 100 \ --eval-interval 1000" -python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \ +torchrun ${DISTRIBUTED_ARGUMENTS} \ third_party/Megatron-LM/pretrain_gpt.py \ ${MOE_ARGUMENTS} \ ${MODEL_ARGUMENTS} \ diff --git a/third_party/Megatron-LM b/third_party/Megatron-LM index f385caf9..3a9e3d8d 160000 --- a/third_party/Megatron-LM +++ b/third_party/Megatron-LM @@ -1 +1 @@ -Subproject commit f385caf934b84e71c946c4342362270edae02173 +Subproject commit 3a9e3d8de308e6f6398b59d16a8bd7177374f121