Showing 2 changed files with 193 additions and 0 deletions.
119 changes: 119 additions & 0 deletions
.github/workflows/regression_yamls/mpt-125m-elastic-resumption.yaml
@@ -0,0 +1,119 @@
integrations:
- integration_type: git_repo
  git_repo: mosaicml/llm-foundry
  git_branch: main
  pip_install: -e .[gpu]

command: |
  cd llm-foundry/scripts
  python data_prep/convert_dataset_hf.py \
    --dataset c4 --data_subset en \
    --out_root ./my-copy-c4 --splits train_small val_small \
    --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
  # Composer command will be inserted below:
image: mosaicml/llm-foundry:2.0.1_cu118-latest
name: mpt-125-elastic-resumption

compute:
  gpus: <INPUT>

parameters:
  data_local: ./my-copy-c4
  data_remote:
  max_seq_len: 2048
  global_seed: 17

  model:
    name: mpt_causal_lm
    init_device: meta
    d_model: 768
    n_heads: 12
    n_layers: 12
    expansion_ratio: 4
    max_seq_len: ${max_seq_len}
    vocab_size: 50368
    attn_config:
      attn_impl: triton

  tokenizer:
    name: EleutherAI/gpt-neox-20b
    kwargs:
      model_max_length: ${max_seq_len}

  train_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      split: train_small
      shuffle: true
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: true
    num_workers: 8

  eval_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      split: val_small
      shuffle: false
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: false
    num_workers: 8

  scheduler:
    name: cosine_with_warmup
    t_warmup: 100ba
    alpha_f: 0.1

  optimizer:
    name: decoupled_adamw
    lr: 6.0e-4
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 0.0

  algorithms:
    gradient_clipping:
      clipping_type: norm
      clipping_threshold: 1.0

  max_duration: <INPUT>
  eval_interval: 500ba
  eval_first: false
  eval_subset_num_batches: -1
  global_train_batch_size: 256

  seed: ${global_seed}
  device_eval_batch_size: 16
  device_train_microbatch_size: auto
  precision: amp_bf16

  fsdp_config:
    sharding_strategy: FULL_SHARD
    mixed_precision: PURE
    activation_checkpointing: false
    activation_checkpointing_reentrant: false
    activation_cpu_offload: false
    limit_all_gathers: true
    verbose: false
    state_dict_type: sharded

  progress_bar: false
  log_to_console: true
  console_log_interval: 1ba

  callbacks:
    speed_monitor:
      window_size: 10
    lr_monitor: {}
    memory_monitor: {}
    runtime_estimator: {}

  save_interval: 10ba
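Note that compute.gpus and max_duration are left as <INPUT> placeholders, and the Composer training command is appended at launch time rather than written into the YAML. A minimal sketch of how this config is consumed, using the same mcli calls as the regression script below (the gpu count and duration here are illustrative values, not part of this commit):

import os

from mcli import RunConfig, create_run

# Load the regression YAML and fill in the fields left as <INPUT>.
config = RunConfig.from_file(
    os.path.join('regression_yamls', 'mpt-125m-elastic-resumption.yaml'))
config.compute['gpus'] = 8                  # illustrative value
config.parameters['max_duration'] = '10ba'  # illustrative value
config.command += '\ncomposer train/train.py /mnt/config/parameters.yaml'

run = create_run(config)  # submit the run to the training platform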
74 changes: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import datetime
import logging
import os
import subprocess

from mcli import RunConfig, RunStatus, create_run, wait_for_run_status

log = logging.getLogger(__name__)

DIR_PATH = os.path.dirname(os.path.abspath(__file__))
REGRESSIONS_DIR = os.path.join(DIR_PATH, 'regression_yamls')

def test_elastic_resumption(cluster: str, save_folder: str, wandb_entity: str,
                            wandb_project: str, git_repo: str, git_branch: str):

    def create_run_and_wait(gpus: int, resume: bool, subdir: str):
        config = RunConfig.from_file(
            os.path.join(REGRESSIONS_DIR, 'mpt-125m-elastic-resumption.yaml'))

        # Add the command to train our model
        composer_command = '\ncomposer train/train.py /mnt/config/parameters.yaml'
        if resume:
            # TODO: autoresume and save_overwrite cannot both be true, but save_overwrite
            # is needed when multiple runs reuse the same save folder.
            composer_command += ' autoresume=true'
        else:
            composer_command += ' save_overwrite=true autoresume=false'
        config.command += composer_command

        # Add a suffix to the run name
        name_suffix = f'-{gpus}'
        if resume:
            name_suffix += '-resume'
        config.name += name_suffix

        # Set other parameters
        config.cluster = cluster
        config.compute['gpus'] = gpus
        config.parameters['save_folder'] = os.path.join(save_folder, subdir)
        config.parameters['max_duration'] = '20ba' if resume else '10ba'
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).strip().decode('utf-8')
        timestamp = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S')
        wandb_group = f'{timestamp}::{commit_hash}'
        wandb_config = {
            'entity': wandb_entity,
            'project': wandb_project,
            'group': wandb_group
        }
        config.parameters['loggers'] = config.parameters.get('loggers', {})
        config.parameters['loggers']['wandb'] = wandb_config
        config.integrations[0]['git_repo'] = git_repo
        config.integrations[0]['git_branch'] = git_branch

        # Start the run and wait for it to complete or terminate.
        run = create_run(config)
        log.info(f'Starting run {run.name}')
        wait_for_run_status(run, RunStatus.COMPLETED)
        if run.status != RunStatus.COMPLETED:
            raise Exception(
                f'Failure on run {run.name}. Run status is {run.status}. Terminating test.')
        log.info(f'Completed run {run.name}')

    # Test 1 node => 2 node elastic resumption
    subdir = '1_to_2_node'
    create_run_and_wait(gpus=8, resume=False, subdir=subdir)
    create_run_and_wait(gpus=16, resume=True, subdir=subdir)

    # Test 2 node => 1 node elastic resumption
    subdir = '2_to_1_node'
    create_run_and_wait(gpus=16, resume=False, subdir=subdir)
    create_run_and_wait(gpus=8, resume=True, subdir=subdir)

if __name__ == '__main__':
    # TODO: Either call the above function from the regression suite or add an
    # entry point here.
    pass
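As a placeholder for the TODO above, here is a minimal sketch of what a command-line entry point could look like; the flag names and defaults are illustrative assumptions, not part of this commit, and it assumes the module contents above (logging and test_elastic_resumption) are in scope:

# Hypothetical entry point; argument names below are illustrative assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Run the MPT-125M elastic resumption regression test.')
    parser.add_argument('--cluster', required=True)
    parser.add_argument('--save-folder', required=True)
    parser.add_argument('--wandb-entity', required=True)
    parser.add_argument('--wandb-project', required=True)
    parser.add_argument('--git-repo', default='mosaicml/llm-foundry')
    parser.add_argument('--git-branch', default='main')
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    test_elastic_resumption(cluster=args.cluster,
                            save_folder=args.save_folder,
                            wandb_entity=args.wandb_entity,
                            wandb_project=args.wandb_project,
                            git_repo=args.git_repo,
                            git_branch=args.git_branch)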