diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 0808677318..22f9d1bd2b 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -337,6 +337,7 @@ def build_finetuning_dataloader( replication_factor if replication_factor > 1 else None, rank=dist.get_global_rank() // replication_factor if replication_factor > 1 else None, + seed=dataset_cfg.get('shuffle_seed', 0), ) assert streaming_dataset is not None # for pyright diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index d5bdd5735f..6fed96a13d 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -474,8 +474,9 @@ def profile_packing( # If streaming dataset, use a temporary local folder for profiling local_rank_zero = dist.get_global_rank() - dist.get_local_rank() - if dataset_cfg.get('remote' - ) is not None and dataset_cfg.get('local') is None: + if dataset_cfg.get( + 'remote', + ) is not None and dataset_cfg.get('local') is None: tmp_path_to_broadcast = tempfile.TemporaryDirectory().name gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) tmp_path = gathered_paths[local_rank_zero] diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 8f3b6bac4e..24586c317d 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -8,6 +8,7 @@ integrations: command: | cd llm-foundry/scripts + export HF_HUB_ENABLE_HF_TRANSFER=1 composer train/train.py /mnt/config/parameters.yaml image: mosaicml/llm-foundry:2.5.1_cu124-latest name: llama2-finetune @@ -21,9 +22,12 @@ compute: # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: - tokenizer_name: meta-llama/Llama-2-7b-hf - max_seq_len: 4096 - global_seed: 17 + variables: + tokenizer_name: meta-llama/Llama-2-7b-hf + global_seed: 17 + max_seq_len: 4096 + + max_seq_len: ${variables.max_seq_len} # Run Name run_name: # If left blank, will be read from env var $RUN_NAME @@ -42,9 +46,9 @@ parameters: # Tokenizer tokenizer: - name: ${tokenizer_name} + name: ${variables.tokenizer_name} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: @@ -52,7 +56,7 @@ parameters: dataset: hf_name: mosaicml/dolly_hhrlhf split: train - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true shuffle: true @@ -75,7 +79,7 @@ parameters: dataset: hf_name: mosaicml/dolly_hhrlhf split: test - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true # packing_ratio: @@ -114,7 +118,7 @@ parameters: global_train_batch_size: 64 # System - seed: ${global_seed} + seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: auto precision: amp_bf16 diff --git a/mcli/mcli-llama3-70b-instruct-finetune.yaml b/mcli/mcli-llama3-70b-instruct-finetune.yaml new file mode 100644 index 0000000000..10c040808d --- /dev/null +++ b/mcli/mcli-llama3-70b-instruct-finetune.yaml @@ -0,0 +1,160 @@ +integrations: +- integration_type: git_repo + git_repo: mosaicml/llm-foundry + git_branch: v0.15.0 + # git_commit: # OR use your commit hash + pip_install: .[gpu] + ssh_clone: false # Should be true if using a private repo + +command: | + cd llm-foundry/scripts + export HF_HUB_ENABLE_HF_TRANSFER=1 + composer train/train.py /mnt/config/parameters.yaml +image: mosaicml/llm-foundry:2.5.1_cu124-latest +name: llama3.1-70b-finetune + +compute: + # Note: Finetuning the 70b model requires at least 16x80GB GPUs + gpus: 16 # Number of GPUs to use + ## These configurations are optional + # cluster: TODO # Name of the cluster to use for this run + # gpu_type: h100_80gb # Type of GPU to use. We use h100_80gb in our experiments + +# The below is injected as a YAML file: /mnt/config/parameters.yaml +parameters: + variables: + tokenizer_name: meta-llama/Llama-3.1-70B-Instruct + global_seed: 17 + max_seq_len: 4096 + + max_seq_len: ${variables.max_seq_len} + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + + max_split_size_mb: 512 + dist_timeout: 3600 # set to avoid NCCL timeouts + + # Model + model: + name: hf_causal_lm + init_device: mixed + pretrained_model_name_or_path: meta-llama/Llama-3.1-70B-Instruct + pretrained: true + # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models + use_auth_token: true + use_flash_attention_2: true + + # Tokenizer + tokenizer: + name: ${variables.tokenizer_name} + kwargs: + model_max_length: ${variables.max_seq_len} + # Dataloaders + train_loader: + name: finetuning + dataset: + hf_name: mosaicml/dolly_hhrlhf + split: train + max_seq_len: ${variables.max_seq_len} + allow_pad_trimming: false + decoder_only_format: true + shuffle: true + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` + # # to profile this run's optimal packing_ratio as it depends on GPU count, + # # batch size, sequence length + # packing_ratio: auto + drop_last: true + num_workers: 8 + pin_memory: false + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + + eval_loader: + name: finetuning + dataset: + hf_name: mosaicml/dolly_hhrlhf + split: test + max_seq_len: ${variables.max_seq_len} + allow_pad_trimming: false + decoder_only_format: true + # packing_ratio: + shuffle: false + drop_last: true + num_workers: 8 + pin_memory: false + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + + # Optimization + scheduler: + name: cosine_with_warmup + t_warmup: 100ba + alpha_f: 0.1 + + # Note: You may want to change learning rate, betas, weight decay + optimizer: + name: decoupled_lionw + lr: 5.0e-7 + betas: + - 0.9 + - 0.95 + weight_decay: 0.0 + + algorithms: + gradient_clipping: + clipping_type: norm + clipping_threshold: 1.0 + + max_duration: 1ep + eval_first: false + eval_interval: 1ep + eval_subset_num_batches: -1 + global_train_batch_size: 16 + + # System + seed: ${variables.global_seed} + device_eval_batch_size: 1 + device_train_microbatch_size: 1 + precision: amp_bf16 + + # FSDP + fsdp_config: + state_dict_type: sharded # Note: we enable sharded checkpointing to avoid GPU OOM + sharding_strategy: FULL_SHARD + mixed_precision: PURE + activation_checkpointing: true + activation_checkpointing_reentrant: false + activation_cpu_offload: false + limit_all_gathers: true + + # Logging + progress_bar: false + log_to_console: true + console_log_interval: 1ba + + callbacks: + speed_monitor: + window_size: 10 + lr_monitor: {} + memory_monitor: {} + runtime_estimator: {} + + load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc + +# loggers: +# wandb: {} + +# Checkpoint to local filesystem or remote object store +# save_interval: 2000ba +# save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK +# save_folder: ./{run_name}/checkpoints +# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints + +# Load from local filesystem or remote object store +# load_path: ./gpt-1b/checkpoints/latest-rank{rank}.pt +# load_path: s3://my-bucket/my-folder/gpt-1b/checkpoints/latest-rank{rank}.pt diff --git a/setup.py b/setup.py index 6aa1553b5d..57acaddf5a 100644 --- a/setup.py +++ b/setup.py @@ -52,11 +52,11 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.27.0,<0.28', + 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.28.0,<0.29', 'mlflow>=2.14.1,<2.19', 'accelerate>=0.25,<1.2', # for HF inference `device_map` 'transformers>=4.43.2,<4.47', - 'mosaicml-streaming>=0.9.0,<0.10', + 'mosaicml-streaming>=0.10.0,<0.11', 'torch>=2.5.1,<2.5.2', 'datasets>=2.20.0,<2.21', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data @@ -91,7 +91,7 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.27.0,<0.28', + 'mosaicml[databricks]>=0.28.0,<0.29', 'numpy<2', 'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', @@ -99,7 +99,7 @@ ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.27.0,<0.28', + 'mosaicml[tensorboard]>=0.28.0,<0.29', ] # Flash 2 group kept for backwards compatibility @@ -110,11 +110,11 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.27.0,<0.28', + 'mosaicml[peft]>=0.28.0,<0.29', ] extra_deps['openai'] = [ - 'openai==1.3.8', + 'openai>=1.56.0,<2.0', 'tiktoken>=0.4,<0.8.1', ] diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py index 48713f8a19..8402694672 100644 --- a/tests/data/test_packing.py +++ b/tests/data/test_packing.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from pathlib import Path -from typing import Any +from typing import Any, Callable from unittest.mock import Mock, patch import pytest @@ -161,27 +161,73 @@ def test_dist_auto_packing(profile_packing: Mock): assert packing_ratio == 2 +def get_remote_config( + base_cfg: dict, + remote_dir: str, + local_dir: str, +) -> DictConfig: + return DictConfig({ + **base_cfg, + 'dataset': { + **base_cfg['dataset'], + 'remote': remote_dir, + 'local': local_dir, + }, + }) + + +def get_streams_config( + base_cfg: dict, + remote_dir: str, + local_dir: str, +) -> DictConfig: + return DictConfig({ + **base_cfg, + 'dataset': { + **base_cfg['dataset'], + 'streams': { + 'stream_with_remote': { + 'remote': remote_dir, + 'local': local_dir, + }, + 'stream_without_remote': { + 'local': remote_dir, + }, + }, + }, + }) + + def patched_packing_ratio(*args: Any, **kwargs: Any): from llmfoundry.data.packing import auto_packing_ratio return auto_packing_ratio(*args, **kwargs, num_packing_ratios=4) +@pytest.mark.parametrize( + 'get_config', + [ + get_remote_config, + get_streams_config, + ], +) @patch( 'llmfoundry.data.finetuning.dataloader.auto_packing_ratio', patched_packing_ratio, ) -def test_auto_packing_with_streaming_dataloader(tmp_path: Path): +def test_auto_packing_with_streaming_dataloader( + get_config: Callable[[dict, str, str], DictConfig], + tmp_path: Path, +): columns = {'prompt': 'str', 'response': 'str'} tokenizer = build_tokenizer('gpt2', {}) remote_dir = str(tmp_path / 'remote') local_dir = str(tmp_path / 'local') with MDSWriter(out=remote_dir, columns=columns, compression=None) as out: out.write({'prompt': 'HELLO', 'response': 'WORLD'}) - cfg = DictConfig({ + + base_cfg = { 'dataset': { - 'remote': remote_dir, - 'local': local_dir, 'packing_ratio': 'auto', 'max_seq_len': 200, 'decoder_only_format': True, @@ -194,7 +240,9 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path): 'prefetch_factor': None, 'persistent_workers': False, 'timeout': 0, - }) + } + + cfg = get_config(base_cfg, remote_dir, local_dir) loader = build_finetuning_dataloader( **cfg, @@ -214,7 +262,10 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path): assert isinstance(loader.batch_size, int) assert loader.dataset.packing_ratio == int(loader.batch_size / 6) - state_dict = loader.dataset.state_dict(num_samples=2, from_beginning=False) + state_dict = loader.dataset.state_dict( + num_samples=2, + from_beginning=False, + ) assert state_dict['sample_in_epoch'] == 2 * loader.dataset.packing_ratio