
Commit

wip
irenedea committed Sep 18, 2023
1 parent 7ec2fe0 commit 79f62f4
Showing 2 changed files with 193 additions and 0 deletions.
119 changes: 119 additions & 0 deletions .github/workflows/regression_yamls/mpt-125m-elastic-resumption.yaml
@@ -0,0 +1,119 @@
integrations:
- integration_type: git_repo
  git_repo: mosaicml/llm-foundry
  git_branch: main
  pip_install: -e .[gpu]

command: |
  cd llm-foundry/scripts
  python data_prep/convert_dataset_hf.py \
    --dataset c4 --data_subset en \
    --out_root ./my-copy-c4 --splits train_small val_small \
    --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
  # Composer command will be inserted below:
image: mosaicml/llm-foundry:2.0.1_cu118-latest
name: mpt-125m-elastic-resumption

compute:
  gpus: <INPUT>

parameters:
  data_local: ./my-copy-c4
  data_remote:
  max_seq_len: 2048
  global_seed: 17

  model:
    name: mpt_causal_lm
    init_device: meta
    d_model: 768
    n_heads: 12
    n_layers: 12
    expansion_ratio: 4
    max_seq_len: ${max_seq_len}
    vocab_size: 50368
    attn_config:
      attn_impl: triton

  tokenizer:
    name: EleutherAI/gpt-neox-20b
    kwargs:
      model_max_length: ${max_seq_len}

  train_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      split: train_small
      shuffle: true
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: true
    num_workers: 8

  eval_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      split: val_small
      shuffle: false
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: false
    num_workers: 8

  scheduler:
    name: cosine_with_warmup
    t_warmup: 100ba
    alpha_f: 0.1

  optimizer:
    name: decoupled_adamw
    lr: 6.0e-4
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 0.0

  algorithms:
    gradient_clipping:
      clipping_type: norm
      clipping_threshold: 1.0

  max_duration: <INPUT>
  eval_interval: 500ba
  eval_first: false
  eval_subset_num_batches: -1
  global_train_batch_size: 256

  seed: ${global_seed}
  device_eval_batch_size: 16
  device_train_microbatch_size: auto
  precision: amp_bf16

  fsdp_config:
    sharding_strategy: FULL_SHARD
    mixed_precision: PURE
    activation_checkpointing: false
    activation_checkpointing_reentrant: false
    activation_cpu_offload: false
    limit_all_gathers: true
    verbose: false
    state_dict_type: sharded

  progress_bar: false
  log_to_console: true
  console_log_interval: 1ba

  callbacks:
    speed_monitor:
      window_size: 10
    lr_monitor: {}
    memory_monitor: {}
    runtime_estimator: {}

  save_interval: 10ba
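
(For context: the regression test in the next file loads this YAML with mcli, fills in the <INPUT> placeholders, and appends the actual training invocation after the "# Composer command will be inserted below:" comment. A rough sketch of that consumption, assuming the YAML sits in the current directory:)

from mcli import RunConfig

# Sketch only: mirrors what test_elastic_resumption.py below does per run.
config = RunConfig.from_file('mpt-125m-elastic-resumption.yaml')
config.compute['gpus'] = 8                  # replaces the <INPUT> placeholder
config.parameters['max_duration'] = '10ba'  # replaces the <INPUT> placeholder
config.command += '\ncomposer train/train.py /mnt/config/parameters.yaml save_overwrite=true autoresume=false'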
74 changes: 74 additions & 0 deletions .github/workflows/test_elastic_resumption.py
@@ -0,0 +1,74 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import datetime
import logging
import os
import subprocess

from mcli import RunConfig, RunStatus, create_run, wait_for_run_status

log = logging.getLogger(__name__)

DIR_PATH = os.path.dirname(os.path.abspath(__file__))
REGRESSIONS_DIR = os.path.join(DIR_PATH, 'regression_yamls')


def test_elastic_resumption(cluster: str, save_folder: str, wandb_entity: str,
                            wandb_project: str, git_repo: str, git_branch: str):

    def create_run_and_wait(gpus: int, resume: bool, subdir: str):
        config = RunConfig.from_file(
            os.path.join(REGRESSIONS_DIR, 'mpt-125m-elastic-resumption.yaml'))

        # Add the command to train our model
        composer_command = '\ncomposer train/train.py /mnt/config/parameters.yaml'
        if resume:
            # TODO: autoresume and save_overwrite cannot both be true, but
            # save_overwrite is needed when multiple runs reuse the same save folder.
            composer_command += ' autoresume=true'
        else:
            composer_command += ' save_overwrite=true autoresume=false'
        config.command += composer_command

        # Add suffix to name
        name_suffix = f'-{gpus}'
        if resume:
            name_suffix += '-resume'
        config.name += name_suffix

        # Set other parameters
        config.cluster = cluster
        config.compute['gpus'] = gpus
        config.parameters['save_folder'] = os.path.join(save_folder, subdir)
        config.parameters['max_duration'] = '20ba' if resume else '10ba'
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).strip().decode('utf-8')
        timestamp = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S')
        wandb_group = f'{timestamp}::{commit_hash}'
        wandb_config = {
            'entity': wandb_entity,
            'project': wandb_project,
            'group': wandb_group
        }
        config.parameters['loggers'] = config.parameters.get('loggers', {})
        config.parameters['loggers']['wandb'] = wandb_config
        config.integrations[0]['git_repo'] = git_repo
        config.integrations[0]['git_branch'] = git_branch

        # Start run
        run = create_run(config)
        log.info(f'Starting run {run.name}')

        # Wait for the run to complete or terminate.
        run = wait_for_run_status(run, RunStatus.COMPLETED)
        if run.status != RunStatus.COMPLETED:
            raise Exception(
                f'Failure on run {run.name}. Run status is {run.status}. Terminating test.'
            )
        log.info(f'Completed run {run.name}')

    # Test 1 node => 2 node elastic resumption
    subdir = '1_to_2_node'
    create_run_and_wait(gpus=8, resume=False, subdir=subdir)
    create_run_and_wait(gpus=16, resume=True, subdir=subdir)

    # Test 2 node => 1 node elastic resumption
    subdir = '2_to_1_node'
    create_run_and_wait(gpus=16, resume=False, subdir=subdir)
    create_run_and_wait(gpus=8, resume=True, subdir=subdir)


if __name__ == '__main__':
    # TODO: Either call the above function in regressions or put an entry point here.
    pass
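
For reference, one possible entry point (a sketch only, not part of this commit; the flag names and module import are illustrative assumptions) would parse the required arguments and call the test directly:

import argparse

from test_elastic_resumption import test_elastic_resumption

# Hypothetical CLI wrapper for the regression driver.
parser = argparse.ArgumentParser(description='Elastic resumption regression test')
parser.add_argument('--cluster', required=True)
parser.add_argument('--save-folder', required=True)
parser.add_argument('--wandb-entity', required=True)
parser.add_argument('--wandb-project', required=True)
parser.add_argument('--git-repo', default='mosaicml/llm-foundry')
parser.add_argument('--git-branch', default='main')
args = parser.parse_args()

test_elastic_resumption(args.cluster, args.save_folder, args.wandb_entity,
                        args.wandb_project, args.git_repo, args.git_branch)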
