Skip to content

Commit

Permalink
Merge pull request #4 from mosaicml/main
Browse files Browse the repository at this point in the history
merging from main
  • Loading branch information
ShashankMosaicML authored Oct 20, 2023
2 parents 5981ade + b2a43a1 commit b179b07
Show file tree
Hide file tree
Showing 7 changed files with 675 additions and 349 deletions.
18 changes: 17 additions & 1 deletion scripts/misc/update_hub_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,24 @@
from llmfoundry.utils.huggingface_hub_utils import \
edit_files_for_hf_compatibility

_ALL_MODELS = [
'mosaicml/mpt-7b',
'mosaicml/mpt-7b-instruct',
'mosaicml/mpt-7b-chat',
'mosaicml/mpt-30b',
'mosaicml/mpt-30b-chat',
'mosaicml/mpt-30b-instruct',
'mosaicml/mpt-7b-8k',
'mosaicml/mpt-7b-8k-instruct',
'mosaicml/mpt-7b-8k-chat',
'mosaicml/mpt-7b-storywriter',
]


def main(hf_repos_for_upload: List[str]):
if len(hf_repos_for_upload) == 1 and hf_repos_for_upload[0] == 'all':
hf_repos_for_upload = _ALL_MODELS

current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime('%B %d, %Y %H:%M:%S')

Expand Down Expand Up @@ -61,7 +77,7 @@ def main(hf_repos_for_upload: List[str]):
create_pr=True,
)

print(f'PR opened: {result}')
print(f'PR opened: {result}\n')


if __name__ == '__main__':
Expand Down
374 changes: 208 additions & 166 deletions scripts/train/benchmarking/README.md

Large diffs are not rendered by default.

78 changes: 46 additions & 32 deletions scripts/train/benchmarking/collect_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import math
from typing import Any, Dict, List, Union

from mcli import sdk as msdk
from composer.callbacks.speed_monitor import \
GPU_AVAILABLE_FLOPS as GPU_FLOP_DICT

GPU_AVAILABLE_FLOPS = 312_000_000_000_000
from mcli import sdk as msdk


def str_to_bool(value: Union[bool, str]):
Expand Down Expand Up @@ -46,13 +47,19 @@ def parse_args():


def get_runs(args: argparse.Namespace):
runs = [r for r in msdk.get_runs() if args.project in r.name]
runs = [
r for r in msdk.get_runs(include_details=True)
if args.project in r.name.split('-')[0] and
r.status == msdk.RunStatus('COMPLETED')
]
for filter in args.filters:
runs = [r for r in runs if filter in r.name]

def sort_key(r: msdk.Run):
model_name = r.name.split('-')[2]
num_gpu = r.config.gpu_num
num_gpu = r.gpus
gpu_type = r.gpu_type
model_precision = r.submitted_config.parameters['precision']
if model_name[-1] == 'm':
model_name_size = 1e6
elif model_name[-1] == 'b':
Expand All @@ -61,9 +68,12 @@ def sort_key(r: msdk.Run):
print(model_name)
raise ValueError
model_size = int(model_name[:-1])
return (model_name_size, model_size, r.config.parameters['max_seq_len'],
num_gpu, r.config.parameters['global_train_batch_size'])
return (gpu_type, model_precision, model_name_size, model_size,
r.submitted_config.parameters['max_seq_len'], num_gpu,
r.submitted_config.parameters['global_train_batch_size'])

unique_runs = {sort_key(i): i for i in runs}
runs = [unique_runs[r] for r in unique_runs]
runs.sort(reverse=True, key=sort_key)

return runs
Expand All @@ -83,17 +93,7 @@ def filter_runs(runs: List[msdk.Run]):

pop_runs = []
for run in runs:
if run.status in [
msdk.RunStatus('FAILED_PULL'),
msdk.RunStatus('PENDING'),
msdk.RunStatus('QUEUED'),
msdk.RunStatus('RUNNING'),
msdk.RunStatus('SCHEDULED'),
msdk.RunStatus('STARTING'),
msdk.RunStatus('STOPPED'),
msdk.RunStatus('STOPPING'),
msdk.RunStatus('TERMINATING'),
]:
if run.status != msdk.RunStatus('COMPLETED'):
print(f'run {run.name} has run status {run.status}')
pop_runs.append(run)
for run in pop_runs:
Expand All @@ -106,13 +106,22 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]:
n_params = micro_batchsize = throughput = -1

model_name = run.name.split('-')[2]
gpu_num = run.config.gpu_num
gpu_type = run.config.gpu_type

fsdp_config = run.config.parameters['fsdp_config']

seq_len = run.config.parameters['max_seq_len']
global_train_batch_size = run.config.parameters['global_train_batch_size']
gpus = run.gpus
gpu_type = run.gpu_type

if 'h100' in gpu_type:
gpu_type = 'h100-sxm'
if 'a100' in gpu_type:
gpu_type = 'a100'
GPU_AVAILABLE_FLOPS = GPU_FLOP_DICT[gpu_type][
run.submitted_config.parameters['precision']]

gpu_type = run.gpu_type
fsdp_config = run.submitted_config.parameters['fsdp_config']

seq_len = run.submitted_config.parameters['max_seq_len']
global_train_batch_size = run.submitted_config.parameters[
'global_train_batch_size']
activation_checkpointing = fsdp_config['activation_checkpointing']

logs = msdk.get_run_logs(run)
Expand All @@ -138,8 +147,8 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]:
throughput = float(line.split(' ')[-1])
break

d_model = run.config.parameters['model']['d_model']
n_layers = run.config.parameters['model']['n_layers']
d_model = run.submitted_config.parameters['model']['d_model']
n_layers = run.submitted_config.parameters['model']['n_layers']

# mfu is approximated using throughput and param count
# the number of parameters is approximately the number of multiply-accumulates (MAC) in the network
Expand All @@ -153,43 +162,48 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]:
attn_flops_per_seq = n_layers * 2 * 2 * (d_model * (seq_len**2))
# there are 2 ops in bwd pass and 1 in fwd pass so we mult by 3
mfu_w_attn = (3 * flops_per_seq + 3 * attn_flops_per_seq) * throughput / (
gpu_num * GPU_AVAILABLE_FLOPS)
gpus * GPU_AVAILABLE_FLOPS)

if activation_checkpointing:
hfu_w_attn = (4 * flops_per_seq + 4 * attn_flops_per_seq
) * throughput / (gpu_num * GPU_AVAILABLE_FLOPS)
) * throughput / (gpus * GPU_AVAILABLE_FLOPS)
else:
hfu_w_attn = mfu_w_attn

model_tflop = int(
(3 * flops_per_seq + 3 * attn_flops_per_seq) * throughput / gpus / 1e12)

return {
'Model':
model_name,
'SeqLen (T)':
seq_len,
'# GPUs':
gpu_num,
gpus,
'GPU':
gpu_type,
'MFU':
round(mfu_w_attn * 100, 2),
'HFU':
round(hfu_w_attn * 100, 2),
'Model TFLOP':
model_tflop,
'MicroBatchSize':
micro_batchsize,
'GradAccum':
math.ceil(global_train_batch_size / gpu_num / micro_batchsize),
math.ceil(global_train_batch_size / gpus / micro_batchsize),
'GlobalBatchSize':
global_train_batch_size,
'Throughput (S/s)':
int(throughput),
'Throughput (T/s)':
int(throughput * seq_len),
'Throughput (T/s/GPU)':
int(throughput * seq_len / gpu_num),
int(throughput * seq_len / gpus),
'GlobalBatchSize (T)':
global_train_batch_size * seq_len,
'Precision':
run.config.parameters['precision'],
run.submitted_config.parameters['precision'],
'MP Mode':
fsdp_config['mixed_precision'],
'Sharding Strategy':
Expand Down
Loading

0 comments on commit b179b07

Please sign in to comment.