Commit: Merge branch 'main' into anna/license-updates
Showing 3 changed files with 121 additions and 6 deletions.
@@ -376,7 +376,7 @@ def get_integrations(project: str,
     git_integration.update({
         'integration_type': 'git_repo',
         'git_repo': 'mosaicml/llm-foundry',
-        'pip_install': '-e .[gpu]'
+        'pip_install': '.[gpu-flash2]'
     })

     integrations = [git_integration]
@@ -398,8 +398,8 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
         {
             'integration_type': 'git_repo',
             'git_repo': 'mosaicml/llm-foundry',
-            'git_branch': 'v0.4.0',
-            'pip_install': '-e .[gpu]',
+            'git_branch': 'main',
+            'pip_install': '.[gpu-flash2]',
         },
         {
             'integration_type': 'wandb',
@@ -411,7 +411,7 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
     command = ''
     if gpu_type == 'h100_80gb' and 'fp8' in precision:  # Required for flash-attn and FP8 training
         command += f"""
-        pip install flash-attn==1.0.7 --no-build-isolation
+        pip install flash-attn==2.4.2 --no-build-isolation
         pip install git+https://github.com/NVIDIA/[email protected]
         pip uninstall install pydantic --yes
         pip install pydantic==1.9.0
@@ -420,11 +420,11 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
     if args.data_remote is None:
         command += f"""
         cd llm-foundry/scripts
-        python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --tokenizer gpt2 --eos_text '<|endoftext|>'
+        python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
         composer train/train.py /mnt/config/parameters.yaml
         """
     else:
-        command = f"""
+        command += f"""
         cd llm-foundry/scripts
         composer train/train.py /mnt/config/parameters.yaml
         """
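A note on the else branch above: the plain assignment (command = f"...") discarded the flash-attn/FP8 install prefix built earlier in run_config whenever args.data_remote was set, while command += keeps that prefix in front of the training command. A minimal illustration of the difference, using hypothetical strings rather than code from this commit:

# Minimal illustration of the fix above; the strings are hypothetical.
prefix = 'pip install flash-attn==2.4.2 --no-build-isolation\n'
train = 'composer train/train.py /mnt/config/parameters.yaml\n'

command = prefix
command = train        # old behavior: assignment drops the install prefix
assert 'flash-attn' not in command

command = prefix
command += train       # new behavior: the prefix and the train command are both kept
assert 'flash-attn' in command and 'composer' in command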
@@ -487,6 +487,7 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
         print(f'Launching run {run.name}')
     else:
         print(f'run = {name}')
+        print(f'{config=}')


 def run_check_capacity(model_yaml: str,
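For orientation, the dictionaries edited above are MosaicML platform run integrations. Below is a minimal sketch, assuming the mcli Python SDK, of how such a git_repo integration and a training command might be assembled into a run. The name, image, cluster, and parameters values are placeholders, and the exact RunConfig/compute field names may differ across mcli versions, so treat this as illustrative rather than as code from this commit.

# Illustrative sketch only (not part of this commit): launching a run whose
# git_repo integration matches the values updated above, via the mcli SDK.
from mcli import RunConfig, create_run  # assumes the mosaicml-cli package

integrations = [
    {
        'integration_type': 'git_repo',
        'git_repo': 'mosaicml/llm-foundry',
        'git_branch': 'main',
        'pip_install': '.[gpu-flash2]',
    },
]

run_config = RunConfig(
    name='tput-example',              # placeholder run name
    image='<insert_image_name>',      # placeholder Docker image
    command='cd llm-foundry/scripts && composer train/train.py /mnt/config/parameters.yaml',
    compute={'cluster': '<insert_cluster_name>', 'gpus': 8, 'gpu_type': 'h100_80gb'},
    integrations=integrations,
    parameters={},                    # training YAML mounted at /mnt/config/parameters.yaml
)

run = create_run(run_config)
print(f'Launching run {run.name}')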
@@ -0,0 +1,91 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import os

# Define the arguments to sweep over

base_args = [
    '--project tput',
    '--image <insert_image_name>',
    '--git_branch main',
    '--precisions bf16',
    '--fsdp_config_mixed_precision PURE',
    '--fsdp_config_limit_all_gathers true',
    '--fsdp_config_forward_prefetch true',
    '--fsdp_config_backward_prefetch BACKWARD_PRE',
    '--activation_cpu_offload false',
    '--seq_len_exp 11 11',
    '--accum 1',
    '--clusters <insert_cluster_name>',
    '--gpu_types h100_80gb',
    '--data_remote <insert_data_remote_location>',
    '--wandb true',
    '--priority lowest',
    '--RUN true',
]

num_gpu_args_list = [
    [
        '--gpu_nums 128',
    ],
    [
        '--gpu_nums 256',
    ],
    [
        '--gpu_nums 512',
    ],
]

model_args_list = [
    [
        '--model_yamls 1b.yaml',
        '--fsdp_config_activation_checkpointing false',
        '--fsdp_config_shard_strategy SHARD_GRAD_OP',
        '--microbatch_size 12',
        '--attn_impl flash',
    ],
    [
        '--model_yamls 3b.yaml',
        '--fsdp_config_activation_checkpointing false',
        '--fsdp_config_shard_strategy SHARD_GRAD_OP',
        '--microbatch_size 8',
        '--attn_impl flash',
    ],
    [
        '--model_yamls 7b.yaml',
        '--fsdp_config_activation_checkpointing false',
        '--fsdp_config_shard_strategy FULL_SHARD',
        '--microbatch_size 6',
        '--attn_impl flash',
    ],
    [
        '--model_yamls 13b.yaml',
        '--fsdp_config_activation_checkpointing true',
        '--fsdp_config_shard_strategy FULL_SHARD',
        '--microbatch_size 16',
        '--attn_impl triton',
    ],
    [
        '--model_yamls 30b.yaml',
        '--fsdp_config_activation_checkpointing true',
        '--fsdp_config_shard_strategy FULL_SHARD',
        '--microbatch_size 8',
        '--attn_impl triton',
    ],
    [
        '--model_yamls 70b.yaml',
        '--fsdp_config_activation_checkpointing true',
        '--fsdp_config_shard_strategy FULL_SHARD',
        '--microbatch_size 8',
        '--attn_impl flash',
    ],
]

# Iterate over the arguments and call submit_benchmarks.py
for num_gpu_args in num_gpu_args_list:
    for model_args in model_args_list:
        command = ['python submit_benchmarks.py'
                  ] + base_args + num_gpu_args + model_args
        command = ' '.join(command)
        os.system(command)
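The driver above submits 3 GPU counts x 6 model configurations = 18 invocations of submit_benchmarks.py, each joined into a single shell command and run via os.system. As an aside, here is a sketch of the same loop using subprocess so that a failed submission raises instead of being silently ignored; this is an alternative formulation, not what this commit does.

# Alternative sketch (not part of this commit): same sweep, but each argument
# string is tokenized and the call fails loudly on a non-zero exit code.
import shlex
import subprocess

for num_gpu_args in num_gpu_args_list:
    for model_args in model_args_list:
        argv = ['python', 'submit_benchmarks.py']
        for arg in base_args + num_gpu_args + model_args:
            argv.extend(shlex.split(arg))  # '--gpu_nums 128' -> ['--gpu_nums', '128']
        subprocess.run(argv, check=True)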