Skip to content

Commit

Permalink
Merge pull request #4 from mosaicml/main
Browse files Browse the repository at this point in the history
merging from main
  • Loading branch information
ShashankMosaicML authored Oct 20, 2023
2 parents 5981ade + b2a43a1 commit b179b07
Show file tree
Hide file tree
Showing 7 changed files with 675 additions and 349 deletions.
18 changes: 17 additions & 1 deletion scripts/misc/update_hub_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,24 @@
from llmfoundry.utils.huggingface_hub_utils import \
edit_files_for_hf_compatibility

_ALL_MODELS = [
'mosaicml/mpt-7b',
'mosaicml/mpt-7b-instruct',
'mosaicml/mpt-7b-chat',
'mosaicml/mpt-30b',
'mosaicml/mpt-30b-chat',
'mosaicml/mpt-30b-instruct',
'mosaicml/mpt-7b-8k',
'mosaicml/mpt-7b-8k-instruct',
'mosaicml/mpt-7b-8k-chat',
'mosaicml/mpt-7b-storywriter',
]


def main(hf_repos_for_upload: List[str]):
if len(hf_repos_for_upload) == 1 and hf_repos_for_upload[0] == 'all':
hf_repos_for_upload = _ALL_MODELS

current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime('%B %d, %Y %H:%M:%S')

Expand Down Expand Up @@ -61,7 +77,7 @@ def main(hf_repos_for_upload: List[str]):
create_pr=True,
)

print(f'PR opened: {result}')
print(f'PR opened: {result}\n')


if __name__ == '__main__':
Expand Down
374 changes: 208 additions & 166 deletions scripts/train/benchmarking/README.md

Large diffs are not rendered by default.

78 changes: 46 additions & 32 deletions scripts/train/benchmarking/collect_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import math
from typing import Any, Dict, List, Union

from mcli import sdk as msdk
from composer.callbacks.speed_monitor import \
GPU_AVAILABLE_FLOPS as GPU_FLOP_DICT

GPU_AVAILABLE_FLOPS = 312_000_000_000_000
from mcli import sdk as msdk


def str_to_bool(value: Union[bool, str]):
Expand Down Expand Up @@ -46,13 +47,19 @@ def parse_args():


def get_runs(args: argparse.Namespace):
runs = [r for r in msdk.get_runs() if args.project in r.name]
runs = [
r for r in msdk.get_runs(include_details=True)
if args.project in r.name.split('-')[0] and
r.status == msdk.RunStatus('COMPLETED')
]
for filter in args.filters:
runs = [r for r in runs if filter in r.name]

def sort_key(r: msdk.Run):
model_name = r.name.split('-')[2]
num_gpu = r.config.gpu_num
num_gpu = r.gpus
gpu_type = r.gpu_type
model_precision = r.submitted_config.parameters['precision']
if model_name[-1] == 'm':
model_name_size = 1e6
elif model_name[-1] == 'b':
Expand All @@ -61,9 +68,12 @@ def sort_key(r: msdk.Run):
print(model_name)
raise ValueError
model_size = int(model_name[:-1])
return (model_name_size, model_size, r.config.parameters['max_seq_len'],
num_gpu, r.config.parameters['global_train_batch_size'])
return (gpu_type, model_precision, model_name_size, model_size,
r.submitted_config.parameters['max_seq_len'], num_gpu,
r.submitted_config.parameters['global_train_batch_size'])

unique_runs = {sort_key(i): i for i in runs}
runs = [unique_runs[r] for r in unique_runs]
runs.sort(reverse=True, key=sort_key)

return runs
Expand All @@ -83,17 +93,7 @@ def filter_runs(runs: List[msdk.Run]):

pop_runs = []
for run in runs:
if run.status in [
msdk.RunStatus('FAILED_PULL'),
msdk.RunStatus('PENDING'),
msdk.RunStatus('QUEUED'),
msdk.RunStatus('RUNNING'),
msdk.RunStatus('SCHEDULED'),
msdk.RunStatus('STARTING'),
msdk.RunStatus('STOPPED'),
msdk.RunStatus('STOPPING'),
msdk.RunStatus('TERMINATING'),
]:
if run.status != msdk.RunStatus('COMPLETED'):
print(f'run {run.name} has run status {run.status}')
pop_runs.append(run)
for run in pop_runs:
Expand All @@ -106,13 +106,22 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]:
n_params = micro_batchsize = throughput = -1

model_name = run.name.split('-')[2]
gpu_num = run.config.gpu_num
gpu_type = run.config.gpu_type

fsdp_config = run.config.parameters['fsdp_config']

seq_len = run.config.parameters['max_seq_len']
global_train_batch_size = run.config.parameters['global_train_batch_size']
gpus = run.gpus
gpu_type = run.gpu_type

if 'h100' in gpu_type:
gpu_type = 'h100-sxm'
if 'a100' in gpu_type:
gpu_type = 'a100'
GPU_AVAILABLE_FLOPS = GPU_FLOP_DICT[gpu_type][
run.submitted_config.parameters['precision']]

gpu_type = run.gpu_type
fsdp_config = run.submitted_config.parameters['fsdp_config']

seq_len = run.submitted_config.parameters['max_seq_len']
global_train_batch_size = run.submitted_config.parameters[
'global_train_batch_size']
activation_checkpointing = fsdp_config['activation_checkpointing']

logs = msdk.get_run_logs(run)
Expand All @@ -138,8 +147,8 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]:
throughput = float(line.split(' ')[-1])
break

d_model = run.config.parameters['model']['d_model']
n_layers = run.config.parameters['model']['n_layers']
d_model = run.submitted_config.parameters['model']['d_model']
n_layers = run.submitted_config.parameters['model']['n_layers']

# mfu is approximated using throughput and param count
# the number of parameters is approximately the number of multiply-accumulates (MAC) in the network
Expand All @@ -153,43 +162,48 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]:
attn_flops_per_seq = n_layers * 2 * 2 * (d_model * (seq_len**2))
# there are 2 ops in bwd pass and 1 in fwd pass so we mult by 3
mfu_w_attn = (3 * flops_per_seq + 3 * attn_flops_per_seq) * throughput / (
gpu_num * GPU_AVAILABLE_FLOPS)
gpus * GPU_AVAILABLE_FLOPS)

if activation_checkpointing:
hfu_w_attn = (4 * flops_per_seq + 4 * attn_flops_per_seq
) * throughput / (gpu_num * GPU_AVAILABLE_FLOPS)
) * throughput / (gpus * GPU_AVAILABLE_FLOPS)
else:
hfu_w_attn = mfu_w_attn

model_tflop = int(
(3 * flops_per_seq + 3 * attn_flops_per_seq) * throughput / gpus / 1e12)

return {
'Model':
model_name,
'SeqLen (T)':
seq_len,
'# GPUs':
gpu_num,
gpus,
'GPU':
gpu_type,
'MFU':
round(mfu_w_attn * 100, 2),
'HFU':
round(hfu_w_attn * 100, 2),
'Model TFLOP':
model_tflop,
'MicroBatchSize':
micro_batchsize,
'GradAccum':
math.ceil(global_train_batch_size / gpu_num / micro_batchsize),
math.ceil(global_train_batch_size / gpus / micro_batchsize),
'GlobalBatchSize':
global_train_batch_size,
'Throughput (S/s)':
int(throughput),
'Throughput (T/s)':
int(throughput * seq_len),
'Throughput (T/s/GPU)':
int(throughput * seq_len / gpu_num),
int(throughput * seq_len / gpus),
'GlobalBatchSize (T)':
global_train_batch_size * seq_len,
'Precision':
run.config.parameters['precision'],
run.submitted_config.parameters['precision'],
'MP Mode':
fsdp_config['mixed_precision'],
'Sharding Strategy':
Expand Down
Loading

0 comments on commit b179b07

Please sign in to comment.