Commit: Merge branch 'main' into anna/license-updates
Showing 3 changed files with 121 additions and 6 deletions.
@@ -376,7 +376,7 @@ def get_integrations(project: str,
     git_integration.update({
         'integration_type': 'git_repo',
         'git_repo': 'mosaicml/llm-foundry',
-        'pip_install': '-e .[gpu]'
+        'pip_install': '.[gpu-flash2]'
     })

     integrations = [git_integration]
@@ -398,8 +398,8 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
         {
             'integration_type': 'git_repo',
             'git_repo': 'mosaicml/llm-foundry',
-            'git_branch': 'v0.4.0',
-            'pip_install': '-e .[gpu]',
+            'git_branch': 'main',
+            'pip_install': '.[gpu-flash2]',
         },
         {
             'integration_type': 'wandb',
@@ -411,7 +411,7 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
     command = ''
     if gpu_type == 'h100_80gb' and 'fp8' in precision:  # Required for flash-attn and FP8 training
         command += f"""
-        pip install flash-attn==1.0.7 --no-build-isolation
+        pip install flash-attn==2.4.2 --no-build-isolation
         pip install git+https://github.com/NVIDIA/[email protected]
         pip uninstall install pydantic --yes
         pip install pydantic==1.9.0
@@ -420,11 +420,11 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
     if args.data_remote is None:
         command += f"""
         cd llm-foundry/scripts
-        python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --tokenizer gpt2 --eos_text '<|endoftext|>'
+        python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
         composer train/train.py /mnt/config/parameters.yaml
         """
     else:
-        command = f"""
+        command += f"""
         cd llm-foundry/scripts
         composer train/train.py /mnt/config/parameters.yaml
         """
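A note on the else branch above: the plain assignment (command = f"...") discarded the flash-attn/FP8 install prefix built earlier in run_config whenever args.data_remote was set, while command += keeps that prefix in front of the training command. A minimal illustration of the difference, using hypothetical strings rather than code from this commit:

# Minimal illustration of the fix above; the strings are hypothetical.
prefix = 'pip install flash-attn==2.4.2 --no-build-isolation\n'
train = 'composer train/train.py /mnt/config/parameters.yaml\n'

command = prefix
command = train        # old behavior: assignment drops the install prefix
assert 'flash-attn' not in command

command = prefix
command += train       # new behavior: the prefix and the train command are both kept
assert 'flash-attn' in command and 'composer' in command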
@@ -487,6 +487,7 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
         print(f'Launching run {run.name}')
     else:
         print(f'run = {name}')
+        print(f'{config=}')


 def run_check_capacity(model_yaml: str,
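For orientation, the dictionaries edited above are MosaicML platform run integrations. Below is a minimal sketch, assuming the mcli Python SDK, of how such a git_repo integration and a training command might be assembled into a run. The name, image, cluster, and parameters values are placeholders, and the exact RunConfig/compute field names may differ across mcli versions, so treat this as illustrative rather than as code from this commit.

# Illustrative sketch only (not part of this commit): launching a run whose
# git_repo integration matches the values updated above, via the mcli SDK.
from mcli import RunConfig, create_run  # assumes the mosaicml-cli package

integrations = [
    {
        'integration_type': 'git_repo',
        'git_repo': 'mosaicml/llm-foundry',
        'git_branch': 'main',
        'pip_install': '.[gpu-flash2]',
    },
]

run_config = RunConfig(
    name='tput-example',              # placeholder run name
    image='<insert_image_name>',      # placeholder Docker image
    command='cd llm-foundry/scripts && composer train/train.py /mnt/config/parameters.yaml',
    compute={'cluster': '<insert_cluster_name>', 'gpus': 8, 'gpu_type': 'h100_80gb'},
    integrations=integrations,
    parameters={},                    # training YAML mounted at /mnt/config/parameters.yaml
)

run = create_run(run_config)
print(f'Launching run {run.name}')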
@@ -0,0 +1,91 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import os

# Define the arguments to sweep over

base_args = [
    '--project tput',
    '--image <insert_image_name>',
    '--git_branch main',
    '--precisions bf16',
    '--fsdp_config_mixed_precision PURE',
    '--fsdp_config_limit_all_gathers true',
    '--fsdp_config_forward_prefetch true',
    '--fsdp_config_backward_prefetch BACKWARD_PRE',
    '--activation_cpu_offload false',
    '--seq_len_exp 11 11',
    '--accum 1',
    '--clusters <insert_cluster_name>',
    '--gpu_types h100_80gb',
    '--data_remote <insert_data_remote_location>',
    '--wandb true',
    '--priority lowest',
    '--RUN true',
]

num_gpu_args_list = [
    [
        '--gpu_nums 128',
    ],
    [
        '--gpu_nums 256',
    ],
    [
        '--gpu_nums 512',
    ],
]

model_args_list = [
    [
        '--model_yamls 1b.yaml',
        '--fsdp_config_activation_checkpointing false',
        '--fsdp_config_shard_strategy SHARD_GRAD_OP',
        '--microbatch_size 12',
        '--attn_impl flash',
    ],
    [
        '--model_yamls 3b.yaml',
        '--fsdp_config_activation_checkpointing false',
        '--fsdp_config_shard_strategy SHARD_GRAD_OP',
        '--microbatch_size 8',
        '--attn_impl flash',
    ],
    [
        '--model_yamls 7b.yaml',
        '--fsdp_config_activation_checkpointing false',
        '--fsdp_config_shard_strategy FULL_SHARD',
        '--microbatch_size 6',
        '--attn_impl flash',
    ],
    [
        '--model_yamls 13b.yaml',
        '--fsdp_config_activation_checkpointing true',
        '--fsdp_config_shard_strategy FULL_SHARD',
        '--microbatch_size 16',
        '--attn_impl triton',
    ],
    [
        '--model_yamls 30b.yaml',
        '--fsdp_config_activation_checkpointing true',
        '--fsdp_config_shard_strategy FULL_SHARD',
        '--microbatch_size 8',
        '--attn_impl triton',
    ],
    [
        '--model_yamls 70b.yaml',
        '--fsdp_config_activation_checkpointing true',
        '--fsdp_config_shard_strategy FULL_SHARD',
        '--microbatch_size 8',
        '--attn_impl flash',
    ],
]

# Iterate over the arguments and call submit_benchmarks.py
for num_gpu_args in num_gpu_args_list:
    for model_args in model_args_list:
        command = ['python submit_benchmarks.py'
                  ] + base_args + num_gpu_args + model_args
        command = ' '.join(command)
        os.system(command)
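The driver above submits 3 GPU counts x 6 model configurations = 18 invocations of submit_benchmarks.py, each joined into a single shell command and run via os.system. As an aside, here is a sketch of the same loop using subprocess so that a failed submission raises instead of being silently ignored; this is an alternative formulation, not what this commit does.

# Alternative sketch (not part of this commit): same sweep, but each argument
# string is tokenized and the call fails loudly on a non-zero exit code.
import shlex
import subprocess

for num_gpu_args in num_gpu_args_list:
    for model_args in model_args_list:
        argv = ['python', 'submit_benchmarks.py']
        for arg in base_args + num_gpu_args + model_args:
            argv.extend(shlex.split(arg))  # '--gpu_nums 128' -> ['--gpu_nums', '128']
        subprocess.run(argv, check=True)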