From f62e23737bc54dbef758fa8c58296ee0b1023e7b Mon Sep 17 00:00:00 2001
From: Sunny Liu
Date: Thu, 17 Oct 2024 15:15:29 -0400
Subject: [PATCH] memoize dataset length for eval sample packing (#1974)

* wip on multimodal sample packing support

* wip on multimodal packing support

* llama-1b-yml

* setup logging for test

* yml

* yml

* yml

* fix for __len__ for eval sample packing

* reverted irrelevant changes

* reformatted, reverted log message

* reverted unnecessary changes

* added e2e multigpu testing for eval sample packing

* formatting

* fixed e2e test_eval params

* fix test_eval e2e multigpu

* fix test_eval e2e multigpu

* Update tests/e2e/multigpu/test_eval.py

Co-authored-by: Wing Lian

* Update tests/e2e/multigpu/test_eval.py

Co-authored-by: Wing Lian

---------

Co-authored-by: Wing Lian
---
 examples/llama-3/qlora-1b.yml           |  77 ++++++++++++
 src/axolotl/utils/samplers/multipack.py |  13 +-
 tests/e2e/multigpu/test_eval.py         | 155 ++++++++++++++++++++++++
 3 files changed, 239 insertions(+), 6 deletions(-)
 create mode 100644 examples/llama-3/qlora-1b.yml
 create mode 100644 tests/e2e/multigpu/test_eval.py

diff --git a/examples/llama-3/qlora-1b.yml b/examples/llama-3/qlora-1b.yml
new file mode 100644
index 0000000000..fdfe4aa7c8
--- /dev/null
+++ b/examples/llama-3/qlora-1b.yml
@@ -0,0 +1,77 @@
+base_model: meta-llama/Llama-3.2-1B
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.1
+output_dir: ./outputs/qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+pad_to_sequence_len: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: "<|end_of_text|>"
diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py
index 205c2894d1..db14a6819e 100644
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -133,6 +133,8 @@ def __init__(
         self.eff_total_used = 0
         self.eff_total_slots = 0
 
+        self.len_across_ranks = None
+
     def set_epoch(self, epoch: int):
         self.epoch = epoch
 
@@ -195,15 +197,14 @@ def calc_min_len(estimates: list[(int, float)]):
             LOG.info(f"gather_len_batches: {repr(estimates)}")
             return math.floor(0.998 * min(estimates))
 
-        min_len_batches = reduce_and_broadcast(
-            lambda: num,
-            calc_min_len,
-        )
+        min_len_batches = reduce_and_broadcast(lambda: num, calc_min_len)
         return min_len_batches
 
     def __len__(self):
-        len_batches = self.num_batches()
-        return self.gather_len_batches(len_batches)
+        if not self.len_across_ranks:
+            len_batches = self.num_batches()
+            self.len_across_ranks = self.gather_len_batches(len_batches)
+        return self.len_across_ranks
 
     def _len_est(self):
         efficiency = (
diff --git a/tests/e2e/multigpu/test_eval.py b/tests/e2e/multigpu/test_eval.py
new file mode 100644
index 0000000000..65d26bb824
--- /dev/null
+++ b/tests/e2e/multigpu/test_eval.py
@@ -0,0 +1,155 @@
+"""
+E2E tests for multigpu eval
+"""
+import logging
+import os
+import unittest
+from pathlib import Path
+
+import yaml
+from accelerate.test_utils import execute_subprocess_async
+
+from axolotl.utils.dict import DictDefault
+
+from ..utils import with_temp_dir
+
+LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
+os.environ["WANDB_DISABLED"] = "true"
+
+AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent
+
+
+class TestMultiGPUEval(unittest.TestCase):
+    """
+    Test case for MultiGPU Eval Sample Packing
+    """
+
+    @with_temp_dir
+    def test_eval_sample_packing(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "JackFram/llama-68m",
+                "load_in_8bit": False,
+                "load_in_4bit": True,
+                "strict": False,
+                "sequence_len": 2048,
+                "adapter": "qlora",
+                "sample_packing": True,
+                "eval_sample_packing": True,
+                "pad_to_sequence_len": True,
+                "lora_r": 8,
+                "lora_alpha": 16,
+                "lora_dropout": 0.05,
+                "lora_target_linear": True,
+                "lora_modules_to_save": ["embed_tokens", "lm_head"],
+                "val_set_size": 0.1,
+                "special_tokens": {"pad_token": "<|end_of_text|>"},
+                "datasets": [
+                    {
+                        "path": "teknium/GPT4-LLM-Cleaned",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 5,
+                "micro_batch_size": 2,
+                "gradient_accumulation_steps": 4,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_8bit",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "loss_watchdog_threshold": 5.0,
+                "loss_watchdog_patience": 3,
+                "bf16": "auto",
+                "warmup_steps": 1,
+                "evals_per_epoch": 2,
+                "eval_max_new_tokens": 128,
+                "saves_per_epoch": 1,
+                "logging_steps": 1,
+                "weight_decay": 0.0,
+            }
+        )
+
+        # write cfg to yaml file
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+        execute_subprocess_async(
+            [
+                "accelerate",
+                "launch",
+                "--num-processes",
+                "2",
+                "-m",
+                "axolotl.cli.train",
+                str(Path(temp_dir) / "config.yaml"),
+            ]
+        )
+
+    @with_temp_dir
+    def test_eval(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "JackFram/llama-68m",
+                "load_in_8bit": False,
+                "load_in_4bit": True,
+                "strict": False,
+                "sequence_len": 2048,
+                "adapter": "qlora",
+                "sample_packing": True,
+                "eval_sample_packing": False,
+                "pad_to_sequence_len": True,
+                "lora_r": 8,
+                "lora_alpha": 16,
+                "lora_dropout": 0.05,
+                "lora_target_linear": True,
+                "lora_modules_to_save": ["embed_tokens", "lm_head"],
+                "val_set_size": 0.1,
+                "special_tokens": {"pad_token": "<|end_of_text|>"},
+                "datasets": [
+                    {
+                        "path": "teknium/GPT4-LLM-Cleaned",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 5,
+                "micro_batch_size": 2,
+                "gradient_accumulation_steps": 4,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_8bit",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "loss_watchdog_threshold": 5.0,
+                "loss_watchdog_patience": 3,
+                "bf16": "auto",
+                "warmup_steps": 1,
+                "evals_per_epoch": 2,
+                "eval_max_new_tokens": 128,
+                "saves_per_epoch": 1,
+                "logging_steps": 1,
+                "weight_decay": 0.0,
+            }
+        )
+
+        # write cfg to yaml file
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+        execute_subprocess_async(
+            [
+                "accelerate",
+                "launch",
+                "--num-processes",
+                "2",
+                "-m",
+                "axolotl.cli.train",
+                str(Path(temp_dir) / "config.yaml"),
+            ]
+        )
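
Note on the multipack.py hunk: the behavioral core of this patch is that MultipackBatchSampler.__len__ now performs the per-rank batch count and the cross-rank gather only once, caching the result in len_across_ranks instead of repeating the collective on every len() call. The standalone sketch below illustrates just that memoization pattern; the class name, constructor argument, and stand-in method bodies are invented for illustration, and in axolotl gather_len_batches is actually a reduce_and_broadcast across ranks rather than the no-op shown here.

# Minimal sketch of the memoized __len__ pattern (illustrative only, not axolotl code).


class LengthMemoSampler:
    def __init__(self, batches_per_rank: int):
        self.batches_per_rank = batches_per_rank
        self.len_across_ranks = None  # cache, filled on the first len() call

    def num_batches(self) -> int:
        # Stand-in for the per-rank batch count estimate.
        return self.batches_per_rank

    def gather_len_batches(self, num: int) -> int:
        # Stand-in for the cross-rank reduce/broadcast; a no-op in this sketch.
        return num

    def __len__(self) -> int:
        # Compute and gather once, then serve the cached value on later calls.
        if not self.len_across_ranks:
            self.len_across_ranks = self.gather_len_batches(self.num_batches())
        return self.len_across_ranks


sampler = LengthMemoSampler(batches_per_rank=42)
assert len(sampler) == 42  # first call computes and caches
assert len(sampler) == 42  # subsequent calls reuse the cached value

The new example config can be exercised the same way the e2e tests launch training, e.g. accelerate launch --num-processes 2 -m axolotl.cli.train examples/llama-3/qlora-1b.yml, assuming a local multi-GPU setup.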