add finetuning with streaming dataset example #945

Merged: 27 commits, Feb 8, 2024
Changes from 19 of 27 commits
3 changes: 3 additions & 0 deletions .github/workflows/code-quality.yaml
@@ -4,10 +4,13 @@ on:
     branches:
     - main
     - release/**
+    # todo: remove this before merging
+    - add_finetuning_streaming_dataset_conversion
   pull_request:
     branches:
     - main
     - release/**
+    - add_finetuning_streaming_dataset_conversion
   workflow_call:
   workflow_dispatch:
 # Cancel old runs when a new commit is pushed to the same branch if not on main or dev
2 changes: 1 addition & 1 deletion llmfoundry/data/finetuning/collator.py
@@ -106,7 +106,7 @@ def __init__(
 
     def __call__(self, examples: List[Dict[str,
                                            Any]]) -> Dict[str, torch.Tensor]:
-        for check_key in ['input_ids', 'labels', 'attention_mask']:
+        for check_key in ['input_ids', 'labels']:
             if check_key not in examples[0]:
                 raise KeyError(
                     f'Examples returned by dataset do not include required key: {check_key}'
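One reading of the relaxed check above: pre-tokenized samples need only carry 'input_ids' and 'labels', since Hugging Face padding can rebuild 'attention_mask' from the unpadded lengths. A standalone illustration of that tokenizer behavior (not the llm-foundry collator itself; the token ids are placeholders):

# Standalone sketch: tokenizer.pad() fills in 'attention_mask' when the
# examples omit it, using the unpadded sequence lengths.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')
tok.pad_token = tok.eos_token  # gpt2 has no pad token by default

examples = [{'input_ids': [31373, 995]}, {'input_ids': [31373]}]  # placeholder ids
batch = tok.pad(examples, padding='longest', return_tensors='pt')
print(batch['attention_mask'])  # tensor([[1, 1], [1, 0]])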
1 change: 1 addition & 0 deletions llmfoundry/data/finetuning/dataloader.py
@@ -152,6 +152,7 @@ def build_finetuning_dataloader(cfg: DictConfig,
             sampling_method=cfg.dataset.get('sampling_method', 'balanced'),
             sampling_granularity=cfg.dataset.get('sampling_granularity', 1),
             batching_method=cfg.dataset.get('batching_method', 'random'),
+            max_seq_len=cfg.dataset.max_seq_len,
         )
 
     else:
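For context, a hedged sketch of a dataloader config that would take this streaming branch, with the dataset-level max_seq_len now forwarded to the streaming dataset; the remote/local paths, sequence length, and batch size here are placeholders, not values from the PR:

# Hedged sketch: build the finetuning dataloader from a streaming (MDS) dataset
# config. Paths and sizes below are placeholders.
from omegaconf import OmegaConf as om
from transformers import AutoTokenizer

from llmfoundry.data import build_finetuning_dataloader

cfg = om.create({
    'name': 'finetuning',
    'dataset': {
        'remote': 's3://my-bucket/finetuning-mds/',  # placeholder remote path
        'local': '/tmp/finetuning-mds/',
        'split': 'train',
        'shuffle': True,
        'max_seq_len': 512,  # forwarded to the streaming dataset by the change above
        'decoder_only_format': True,
    },
    'drop_last': True,
    'num_workers': 8,
})

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
train_loader = build_finetuning_dataloader(cfg, tokenizer, device_batch_size=8)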
11 changes: 11 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
@@ -42,6 +42,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 
 import datasets as hf_datasets
 import huggingface_hub as hf_hub
+import numpy as np
 from composer.utils import dist
 from streaming import StreamingDataset
 from transformers import PreTrainedTokenizerBase
@@ -332,6 +333,7 @@ def __init__(self,
                  sampling_method: str = 'balanced',
                  sampling_granularity: int = 1,
                  batching_method: str = 'random',
+                 max_seq_len: int = 2048,
                  **kwargs: Any):
 
         if len(kwargs) > 0:
@@ -371,10 +373,19 @@
         )
 
         self.tokenizer = tokenizer
+        self.max_seq_len = max_seq_len
 
     # How to process a sample
     def __getitem__(self, idx: int) -> Dict[str, Any]:
         sample = super().__getitem__(idx)
+        if 'input_ids' in sample:
+            # already tokenized data
+            sample['input_ids'] = np.frombuffer(
+                sample['input_ids'],
+                dtype=np.int64)[:self.max_seq_len].tolist().copy()
+            sample['labels'] = np.frombuffer(sample['labels'],
+                                             dtype=np.int64).tolist().copy()
+            return sample
         return tokenize_formatted_example(sample, tokenizer=self.tokenizer)


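The new branch in __getitem__ expects samples whose 'input_ids' and 'labels' fields are raw int64 bytes. A hedged sketch (placeholder paths and data, not the PR's actual conversion script) of writing such a pre-tokenized dataset with streaming's MDSWriter, intended to mirror the prompt/response split that tokenize_formatted_example produces ('input_ids' holds the tokenized prompt, 'labels' the tokenized response):

# Hedged sketch: serialize pre-tokenized prompt/response pairs as int64 bytes so
# that np.frombuffer(..., dtype=np.int64) in __getitem__ can read them back.
import numpy as np
from streaming import MDSWriter
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
columns = {'input_ids': 'bytes', 'labels': 'bytes'}
samples = [{'prompt': 'What is 2 + 2?', 'response': ' 4'}]  # placeholder data

with MDSWriter(out='/tmp/finetuning-mds/train', columns=columns,
               compression='zstd') as writer:
    for sample in samples:
        prompt_ids = np.asarray(tokenizer(sample['prompt'])['input_ids'], dtype=np.int64)
        response_ids = np.asarray(tokenizer(sample['response'])['input_ids'], dtype=np.int64)
        writer.write({
            'input_ids': prompt_ids.tobytes(),
            'labels': response_ids.tobytes(),
        })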
@@ -0,0 +1,77 @@
max_seq_len: 512
global_seed: 17

# Run Name
run_name: # If left blank, will be read from env var $RUN_NAME

# Model
model:
  name: hf_causal_lm
  pretrained_model_name_or_path: gpt2
  pretrained: true  # false: only use the architecture; true: initialize with pretrained weights

# Tokenizer
tokenizer:
  name: gpt2
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    ############
    remote: /Users/ning.wang/projects/llm-foundry/scripts/data_prep/example_data/
    local: /tmp/
    split: train
    ############
    shuffle: true
    max_seq_len: ${max_seq_len}
    decoder_only_format: true
  drop_last: true
  num_workers: 8

# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 1ep
eval_interval: 1
eval_first: false
eval_subset_num_batches: -1
global_train_batch_size: 8

# System
seed: ${global_seed}
device_eval_batch_size: 8
device_train_microbatch_size: 8
# device_train_microbatch_size: auto
precision: fp32

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
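A config like this is typically launched with llm-foundry's train script, e.g. composer scripts/train/train.py followed by the path to this YAML. A hedged sketch of loading the file directly, which also resolves the ${max_seq_len} and ${global_seed} interpolations used above (the filename is a placeholder):

# Hedged sketch: load the example YAML and resolve its interpolations.
from omegaconf import OmegaConf as om

cfg = om.load('finetune_streaming_example.yaml')  # placeholder filename
om.resolve(cfg)
print(cfg.train_loader.dataset.max_seq_len)  # 512, via ${max_seq_len}
print(cfg.seed)                              # 17, via ${global_seed}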