
Commit

Merge branch 'main' into shashank/new_preprocessor
ShashankMosaicML authored Dec 16, 2024
2 parents a0b08a4 + c494017 commit e609ed9
Showing 16 changed files with 256 additions and 35 deletions.
1 change: 1 addition & 0 deletions llmfoundry/data/finetuning/dataloader.py
@@ -336,6 +336,7 @@ def build_finetuning_dataloader(
             replication_factor if replication_factor > 1 else None,
             rank=dist.get_global_rank() //
             replication_factor if replication_factor > 1 else None,
+            seed=dataset_cfg.get('shuffle_seed', 0),
         )
 
     assert streaming_dataset is not None  # for pyright
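The new `seed` argument threads a config-driven shuffle seed into the streaming dataset. A minimal sketch of the lookup, with a hypothetical dataset config (only `shuffle_seed` and its default of 0 come from the diff):

    # Hypothetical config dict; 'shuffle_seed' is the new key read by the dataloader.
    dataset_cfg = {'hf_name': 'mosaicml/dolly_hhrlhf', 'shuffle_seed': 42}
    seed = dataset_cfg.get('shuffle_seed', 0)  # falls back to 0 when unset
    # The dataloader then passes seed=seed when building the streaming dataset.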
7 changes: 5 additions & 2 deletions llmfoundry/data/packing.py
@@ -474,7 +474,9 @@ def profile_packing(

     # If streaming dataset, use a temporary local folder for profiling
     local_rank_zero = dist.get_global_rank() - dist.get_local_rank()
-    if dataset_cfg.get('remote') is not None:
+    if dataset_cfg.get(
+        'remote',
+    ) is not None and dataset_cfg.get('local') is None:
         tmp_path_to_broadcast = tempfile.TemporaryDirectory().name
         gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
         tmp_path = gathered_paths[local_rank_zero]
@@ -485,7 +487,8 @@
             tmp_path_to_broadcast = tempfile.TemporaryDirectory().name
             gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
             tmp_path = gathered_paths[local_rank_zero]
-            stream_config['local'] = tmp_path
+            if stream_config.get('local') is None:
+                stream_config['local'] = tmp_path
 
     # Determine the packing_ratio values we'll try
     packing_ratios, raw_batch_sizes = [], []
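Together, the two hunks make packing profiling respect a user-supplied 'local' path: a temporary directory is substituted only when a stream has a remote source and no local path of its own. A condensed sketch of the resulting guard (hypothetical helper name; the real code also broadcasts the temp path so all ranks agree on it):

    import tempfile

    def maybe_set_local(stream_config: dict, tmp_path: str) -> None:
        # Fall back to the profiling temp dir only when the user set no 'local'.
        if stream_config.get('remote') is not None and stream_config.get('local') is None:
            stream_config['local'] = tmp_path

    maybe_set_local({'remote': 's3://bucket/data'}, tempfile.TemporaryDirectory().name)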
4 changes: 3 additions & 1 deletion llmfoundry/utils/config_utils.py
@@ -8,6 +8,7 @@
 import os
 import warnings
 from dataclasses import dataclass, fields
+from pathlib import Path
 from typing import (
     Any,
     Callable,
@@ -703,6 +704,8 @@ def _process_data_source(
         true_split (str): The split of the dataset to be added (i.e. train or eval)
         data_paths (List[Tuple[str, str, str]]): A list of tuples formatted as (data type, path, split)
     """
+    if source_dataset_path:
+        source_dataset_path = str(Path(source_dataset_path))
     # Check for Delta table
     if source_dataset_path and len(source_dataset_path.split('.')) == 3:
         data_paths.append(('delta_table', source_dataset_path, true_split))
@@ -788,7 +791,6 @@ def log_dataset_uri(cfg: dict[str, Any]) -> None:

     # Map data source types to their respective MLFlow DataSource.
     for dataset_type, path, split in data_paths:
-
         if dataset_type in dataset_source_mapping:
             source_class = dataset_source_mapping[dataset_type]
             if dataset_type == 'delta_table':
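The `Path` round-trip normalizes `source_dataset_path` before the Delta-table check, which treats any three dot-separated parts as a table name. One concrete effect (illustrative values): a trailing slash no longer defeats the check.

    from pathlib import Path

    source_dataset_path = 'catalog.schema.table/'         # e.g. pasted with a trailing slash
    source_dataset_path = str(Path(source_dataset_path))  # -> 'catalog.schema.table'
    assert len(source_dataset_path.split('.')) == 3       # now recognized as a Delta table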
2 changes: 1 addition & 1 deletion mcli/mcli-1b-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-1b-max-seq-len-8k.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-1b.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-benchmark-mpt.yaml
@@ -11,7 +11,7 @@ image: mosaicml/llm-foundry:2.5.1_cu124-latest
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]

2 changes: 1 addition & 1 deletion mcli/mcli-convert-composer-to-hf.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .
   ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-hf-generate.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
22 changes: 13 additions & 9 deletions mcli/mcli-llama2-finetune.yaml
@@ -1,13 +1,14 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
 
 command: |
   cd llm-foundry/scripts
+  export HF_HUB_ENABLE_HF_TRANSFER=1
   composer train/train.py /mnt/config/parameters.yaml
 image: mosaicml/llm-foundry:2.5.1_cu124-latest
 name: llama2-finetune
@@ -21,9 +22,12 @@ compute:

 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
-  tokenizer_name: meta-llama/Llama-2-7b-hf
-  max_seq_len: 4096
-  global_seed: 17
+  variables:
+    tokenizer_name: meta-llama/Llama-2-7b-hf
+    global_seed: 17
+    max_seq_len: 4096
+
+  max_seq_len: ${variables.max_seq_len}
 
   # Run Name
   run_name: # If left blank, will be read from env var $RUN_NAME
@@ -42,17 +46,17 @@ parameters:

   # Tokenizer
   tokenizer:
-    name: ${tokenizer_name}
+    name: ${variables.tokenizer_name}
     kwargs:
-      model_max_length: ${max_seq_len}
+      model_max_length: ${variables.max_seq_len}
 
   # Dataloaders
   train_loader:
     name: finetuning
     dataset:
       hf_name: mosaicml/dolly_hhrlhf
       split: train
-      max_seq_len: ${max_seq_len}
+      max_seq_len: ${variables.max_seq_len}
       allow_pad_trimming: false
       decoder_only_format: true
       shuffle: true
@@ -75,7 +79,7 @@ parameters:
     dataset:
       hf_name: mosaicml/dolly_hhrlhf
       split: test
-      max_seq_len: ${max_seq_len}
+      max_seq_len: ${variables.max_seq_len}
       allow_pad_trimming: false
       decoder_only_format: true
       # packing_ratio:
@@ -114,7 +118,7 @@ parameters:
   global_train_batch_size: 64
 
   # System
-  seed: ${global_seed}
+  seed: ${variables.global_seed}
   device_eval_batch_size: 8
   device_train_microbatch_size: auto
   precision: amp_bf16
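The tokenizer name, seed, and sequence length now live under a `variables:` block and are referenced as `${variables.*}`. A minimal sketch of how such references resolve, assuming OmegaConf-style interpolation (llm-foundry loads these YAMLs with omegaconf; exact resolution details may differ):

    from omegaconf import OmegaConf

    cfg = OmegaConf.create({
        'variables': {'max_seq_len': 4096, 'global_seed': 17},
        'max_seq_len': '${variables.max_seq_len}',
        'seed': '${variables.global_seed}',
    })
    resolved = OmegaConf.to_container(cfg, resolve=True)
    assert resolved['max_seq_len'] == 4096 and resolved['seed'] == 17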
160 changes: 160 additions & 0 deletions mcli/mcli-llama3-70b-instruct-finetune.yaml
@@ -0,0 +1,160 @@
integrations:
- integration_type: git_repo
  git_repo: mosaicml/llm-foundry
  git_branch: v0.15.1
  # git_commit: # OR use your commit hash
  pip_install: .[gpu]
  ssh_clone: false # Should be true if using a private repo

command: |
  cd llm-foundry/scripts
  export HF_HUB_ENABLE_HF_TRANSFER=1
  composer train/train.py /mnt/config/parameters.yaml
image: mosaicml/llm-foundry:2.5.1_cu124-latest
name: llama3.1-70b-finetune

compute:
  # Note: Finetuning the 70b model requires at least 16x80GB GPUs
  gpus: 16 # Number of GPUs to use
  ## These configurations are optional
  # cluster: TODO # Name of the cluster to use for this run
  # gpu_type: h100_80gb # Type of GPU to use. We use h100_80gb in our experiments

# The below is injected as a YAML file: /mnt/config/parameters.yaml
parameters:
  variables:
    tokenizer_name: meta-llama/Llama-3.1-70B-Instruct
    global_seed: 17
    max_seq_len: 4096

  max_seq_len: ${variables.max_seq_len}
  # Run Name
  run_name: # If left blank, will be read from env var $RUN_NAME

  max_split_size_mb: 512
  dist_timeout: 3600 # set to avoid NCCL timeouts

  # Model
  model:
    name: hf_causal_lm
    init_device: mixed
    pretrained_model_name_or_path: meta-llama/Llama-3.1-70B-Instruct
    pretrained: true
    # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models
    use_auth_token: true
    use_flash_attention_2: true

  # Tokenizer
  tokenizer:
    name: ${variables.tokenizer_name}
    kwargs:
      model_max_length: ${variables.max_seq_len}
  # Dataloaders
  train_loader:
    name: finetuning
    dataset:
      hf_name: mosaicml/dolly_hhrlhf
      split: train
      max_seq_len: ${variables.max_seq_len}
      allow_pad_trimming: false
      decoder_only_format: true
      shuffle: true
      # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
      # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
      # # of the dataset.
      # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
      # # to profile this run's optimal packing_ratio as it depends on GPU count,
      # # batch size, sequence length
      # packing_ratio: auto
    drop_last: true
    num_workers: 8
    pin_memory: false
    prefetch_factor: 2
    persistent_workers: true
    timeout: 0

  eval_loader:
    name: finetuning
    dataset:
      hf_name: mosaicml/dolly_hhrlhf
      split: test
      max_seq_len: ${variables.max_seq_len}
      allow_pad_trimming: false
      decoder_only_format: true
      # packing_ratio:
      shuffle: false
    drop_last: true
    num_workers: 8
    pin_memory: false
    prefetch_factor: 2
    persistent_workers: true
    timeout: 0

  # Optimization
  scheduler:
    name: cosine_with_warmup
    t_warmup: 100ba
    alpha_f: 0.1

  # Note: You may want to change learning rate, betas, weight decay
  optimizer:
    name: decoupled_lionw
    lr: 5.0e-7
    betas:
    - 0.9
    - 0.95
    weight_decay: 0.0

  algorithms:
    gradient_clipping:
      clipping_type: norm
      clipping_threshold: 1.0

  max_duration: 1ep
  eval_first: false
  eval_interval: 1ep
  eval_subset_num_batches: -1
  global_train_batch_size: 16

  # System
  seed: ${variables.global_seed}
  device_eval_batch_size: 1
  device_train_microbatch_size: 1
  precision: amp_bf16

  # FSDP
  fsdp_config:
    state_dict_type: sharded # Note: we enable sharded checkpointing to avoid GPU OOM
    sharding_strategy: FULL_SHARD
    mixed_precision: PURE
    activation_checkpointing: true
    activation_checkpointing_reentrant: false
    activation_cpu_offload: false
    limit_all_gathers: true

  # Logging
  progress_bar: false
  log_to_console: true
  console_log_interval: 1ba

  callbacks:
    speed_monitor:
      window_size: 10
    lr_monitor: {}
    memory_monitor: {}
    runtime_estimator: {}

  load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc

  # loggers:
  #   wandb: {}

  # Checkpoint to local filesystem or remote object store
  # save_interval: 2000ba
  # save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK
  # save_folder: ./{run_name}/checkpoints
  # save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints

  # Load from local filesystem or remote object store
  # load_path: ./gpt-1b/checkpoints/latest-rank{rank}.pt
  # load_path: s3://my-bucket/my-folder/gpt-1b/checkpoints/latest-rank{rank}.pt
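A quick sanity check on the sizing in this new config: with 16 GPUs and global_train_batch_size: 16, each device sees one sample per step, which matches device_train_microbatch_size: 1, i.e. no gradient accumulation (back-of-the-envelope arithmetic only; Composer computes this internally):

    gpus = 16
    global_train_batch_size = 16
    device_train_microbatch_size = 1

    per_device_batch = global_train_batch_size // gpus                   # 1 sample per GPU per step
    grad_accum_steps = per_device_batch // device_train_microbatch_size  # 1 -> no accumulation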
2 changes: 1 addition & 1 deletion mcli/mcli-openai-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu,openai]
   ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-pretokenize-oci-upload.yaml
@@ -14,7 +14,7 @@ integrations:
   - oci-cli==3.23.2
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .
   ssh_clone: false # Should be true if using a private repo
