Showing 2 changed files with 193 additions and 0 deletions.
119 changes: 119 additions & 0 deletions
.github/workflows/regression_yamls/mpt-125m-elastic-resumption.yaml
@@ -0,0 +1,119 @@
integrations:
- integration_type: git_repo
  git_repo: mosaicml/llm-foundry
  git_branch: main
  pip_install: -e .[gpu]

command: |
  cd llm-foundry/scripts
  python data_prep/convert_dataset_hf.py \
    --dataset c4 --data_subset en \
    --out_root ./my-copy-c4 --splits train_small val_small \
    --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
  # Composer command will be inserted below:
image: mosaicml/llm-foundry:2.0.1_cu118-latest
name: mpt-125-elastic-resumption

compute:
  gpus: <INPUT>

parameters:
  data_local: ./my-copy-c4
  data_remote:
  max_seq_len: 2048
  global_seed: 17

  model:
    name: mpt_causal_lm
    init_device: meta
    d_model: 768
    n_heads: 12
    n_layers: 12
    expansion_ratio: 4
    max_seq_len: ${max_seq_len}
    vocab_size: 50368
    attn_config:
      attn_impl: triton

  tokenizer:
    name: EleutherAI/gpt-neox-20b
    kwargs:
      model_max_length: ${max_seq_len}

  train_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      split: train_small
      shuffle: true
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: true
    num_workers: 8

  eval_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      split: val_small
      shuffle: false
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: false
    num_workers: 8

  scheduler:
    name: cosine_with_warmup
    t_warmup: 100ba
    alpha_f: 0.1

  optimizer:
    name: decoupled_adamw
    lr: 6.0e-4
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 0.0

  algorithms:
    gradient_clipping:
      clipping_type: norm
      clipping_threshold: 1.0

  max_duration: <INPUT>
  eval_interval: 500ba
  eval_first: false
  eval_subset_num_batches: -1
  global_train_batch_size: 256

  seed: ${global_seed}
  device_eval_batch_size: 16
  device_train_microbatch_size: auto
  precision: amp_bf16

  fsdp_config:
    sharding_strategy: FULL_SHARD
    mixed_precision: PURE
    activation_checkpointing: false
    activation_checkpointing_reentrant: false
    activation_cpu_offload: false
    limit_all_gathers: true
    verbose: false
    state_dict_type: sharded

  progress_bar: false
  log_to_console: true
  console_log_interval: 1ba

  callbacks:
    speed_monitor:
      window_size: 10
    lr_monitor: {}
    memory_monitor: {}
    runtime_estimator: {}

  save_interval: 10ba
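Note that compute.gpus and max_duration are left as <INPUT> placeholders, and the Composer training command is appended at launch time rather than written into the YAML. A minimal sketch of how this config is consumed, using the same mcli calls as the regression script below (the gpu count and duration here are illustrative values, not part of this commit):

import os

from mcli import RunConfig, create_run

# Load the regression YAML and fill in the fields left as <INPUT>.
config = RunConfig.from_file(
    os.path.join('regression_yamls', 'mpt-125m-elastic-resumption.yaml'))
config.compute['gpus'] = 8                  # illustrative value
config.parameters['max_duration'] = '10ba'  # illustrative value
config.command += '\ncomposer train/train.py /mnt/config/parameters.yaml'

run = create_run(config)  # submit the run to the training platform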
74 changes: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import datetime
import logging
import os
import subprocess

from mcli import RunConfig, RunStatus, create_run, wait_for_run_status

log = logging.getLogger(__name__)

DIR_PATH = os.path.dirname(os.path.abspath(__file__))
REGRESSIONS_DIR = os.path.join(DIR_PATH, 'regression_yamls')

def test_elastic_resumption(cluster: str, save_folder: str, wandb_entity: str,
                            wandb_project: str, git_repo: str, git_branch: str):

    def create_run_and_wait(gpus: int, resume: bool, subdir: str):
        config = RunConfig.from_file(
            os.path.join(REGRESSIONS_DIR, 'mpt-125m-elastic-resumption.yaml'))

        # Add the command to train our model
        composer_command = '\ncomposer train/train.py /mnt/config/parameters.yaml'
        if resume:
            # TODO: autoresume and save_overwrite cannot both be true, but save_overwrite
            # is needed when multiple runs reuse the same save folder.
            composer_command += ' autoresume=true'
        else:
            composer_command += ' save_overwrite=true autoresume=false'
        config.command += composer_command

        # Add a suffix to the run name
        name_suffix = f'-{gpus}'
        if resume:
            name_suffix += '-resume'
        config.name += name_suffix

        # Set other parameters
        config.cluster = cluster
        config.compute['gpus'] = gpus
        config.parameters['save_folder'] = os.path.join(save_folder, subdir)
        config.parameters['max_duration'] = '20ba' if resume else '10ba'
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).strip().decode('utf-8')
        timestamp = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S')
        wandb_group = f'{timestamp}::{commit_hash}'
        wandb_config = {
            'entity': wandb_entity,
            'project': wandb_project,
            'group': wandb_group
        }
        config.parameters['loggers'] = config.parameters.get('loggers', {})
        config.parameters['loggers']['wandb'] = wandb_config
        config.integrations[0]['git_repo'] = git_repo
        config.integrations[0]['git_branch'] = git_branch

        # Start the run and wait for it to complete or terminate.
        run = create_run(config)
        log.info(f'Starting run {run.name}')
        wait_for_run_status(run, RunStatus.COMPLETED)
        if run.status != RunStatus.COMPLETED:
            raise Exception(
                f'Failure on run {run.name}. Run status is {run.status}. Terminating test.')
        log.info(f'Completed run {run.name}')

    # Test 1 node => 2 node elastic resumption
    subdir = '1_to_2_node'
    create_run_and_wait(gpus=8, resume=False, subdir=subdir)
    create_run_and_wait(gpus=16, resume=True, subdir=subdir)

    # Test 2 node => 1 node elastic resumption
    subdir = '2_to_1_node'
    create_run_and_wait(gpus=16, resume=False, subdir=subdir)
    create_run_and_wait(gpus=8, resume=True, subdir=subdir)

if __name__ == '__main__':
    # TODO: Either call the above function from the regression suite or add an
    # entry point here.
    pass
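As a placeholder for the TODO above, here is a minimal sketch of what a command-line entry point could look like; the flag names and defaults are illustrative assumptions, not part of this commit, and it assumes the module contents above (logging and test_elastic_resumption) are in scope:

# Hypothetical entry point; argument names below are illustrative assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Run the MPT-125M elastic resumption regression test.')
    parser.add_argument('--cluster', required=True)
    parser.add_argument('--save-folder', required=True)
    parser.add_argument('--wandb-entity', required=True)
    parser.add_argument('--wandb-project', required=True)
    parser.add_argument('--git-repo', default='mosaicml/llm-foundry')
    parser.add_argument('--git-branch', default='main')
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    test_elastic_resumption(cluster=args.cluster,
                            save_folder=args.save_folder,
                            wandb_entity=args.wandb_entity,
                            wandb_project=args.wandb_project,
                            git_repo=args.git_repo,
                            git_branch=args.git_branch)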