From 19a30cfedec0c7ceb7711a33b0d42f40bf29658e Mon Sep 17 00:00:00 2001 From: Glavin Wiechert Date: Sun, 10 Sep 2023 22:34:37 +0000 Subject: [PATCH] WIP Integrate quantization into finetune script --- examples/llama-2/lora-short.yml | 70 +++++++++++ scripts/finetune.py | 45 +++++++- scripts/quantize.py | 198 +++----------------------------- src/axolotl/utils/quantize.py | 132 +++++++++++++++++++++ 4 files changed, 263 insertions(+), 182 deletions(-) create mode 100644 examples/llama-2/lora-short.yml create mode 100644 src/axolotl/utils/quantize.py diff --git a/examples/llama-2/lora-short.yml b/examples/llama-2/lora-short.yml new file mode 100644 index 0000000000..bd2b51b962 --- /dev/null +++ b/examples/llama-2/lora-short.yml @@ -0,0 +1,70 @@ +base_model: meta-llama/Llama-2-7b-hf +base_model_config: meta-llama/Llama-2-7b-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +# val_set_size: 0.01 +val_set_size: 0.001 +output_dir: ./lora-out + +sequence_len: 4096 +sample_packing: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +# num_epochs: 3 +# num_epochs: 1 +num_epochs: 0.1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/scripts/finetune.py b/scripts/finetune.py index 1ea18b98b8..ecef38bba0 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -27,6 +27,7 @@ from axolotl.utils.models import load_tokenizer from axolotl.utils.tokenization import check_dataset_labels from axolotl.utils.wandb import setup_wandb_env_vars +from axolotl.utils.quantize import get_examples_for_quantization, load_merged_model, quantize_and_save project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) src_dir = os.path.join(project_root, "src") @@ -87,8 +88,14 @@ def do_merge_lora( cfg: DictDefault, cli_args: TrainerCliArgs, ): - model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) - do_merge_lora_model_and_tokenizer(cfg=cfg, model=model, tokenizer=tokenizer) + new_cfg = DictDefault({ + **cfg, + 'lora_model_dir': cfg['output_dir'], + 'load_in_8bit': False, + 'load_in_4bit': False, + }) + model, tokenizer = load_model_and_tokenizer(cfg=new_cfg, cli_args=cli_args) + do_merge_lora_model_and_tokenizer(cfg=new_cfg, model=model, tokenizer=tokenizer) def shard( *, @@ -282,7 +289,39 @@ def do_cli(config: Path = Path("examples/"), **kwargs): dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args) if parsed_cli_args.prepare_ds_only: return - train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta) + model, tokenizer = train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta) + # tokenizer = None + should_quantize = True + + if should_quantize: + # Merge model + # 
do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args) + # do_merge_lora_model_and_tokenizer(cfg=parsed_cfg, model=model, tokenizer=tokenizer) + # new_cfg = parsed_cfg.copy() + # new_cfg['lora_model_dir'] = new_cfg['output_dir'] + # new_cfg['load_in_8bit'] = False + # new_cfg['load_in_4bit'] = False + + # new_cfg = DictDefault({ + # **parsed_cfg, + # 'lora_model_dir': parsed_cfg['output_dir'], + # 'load_in_8bit': False, + # 'load_in_4bit': False, + # }) + # lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False + # do_merge_lora(cfg=new_cfg, cli_args=parsed_cli_args) + + # TODO: release old model from GPU memory + do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args) + + # Load merged model with AutoGPTQ + merged_model = load_merged_model(parsed_cfg) + + # Quantize & save + n_samples = 128 + examples = get_examples_for_quantization(dataset_meta.train_dataset, n_samples) + quantize_and_save(parsed_cfg, merged_model, tokenizer, examples) + if __name__ == "__main__": diff --git a/scripts/quantize.py b/scripts/quantize.py index 56106cc655..ff382b7da1 100644 --- a/scripts/quantize.py +++ b/scripts/quantize.py @@ -23,6 +23,8 @@ # from scripts.finetune import load_cfg from finetune import load_cfg, get_merged_out_dir, do_merge_lora_model_and_tokenizer, load_datasets +from axolotl.utils.quantize import load_merged_model, get_quantized_model, quantize_and_save, push_model, get_quantized_model_id, get_quantized_model_dir, get_examples_for_quantization + configure_logging() LOG = logging.getLogger("axolotl") @@ -39,7 +41,8 @@ print("Done importing...") ## CHANGE BELOW ## -config_path: Path = Path("./examples/llama-2/lora.yml") +# config_path: Path = Path("./examples/llama-2/lora.yml") +config_path: Path = Path("./examples/llama-2/lora-short.yml") # pretrained_model_dir = "facebook/opt-125m" # quantized_model_dir = "opt-125m-4bit" @@ -47,177 +50,12 @@ # huggingface_username = "CHANGE_ME" ## CHANGE ABOVE -quantize_config = BaseQuantizeConfig( - bits=4, # quantize model to 4-bit - group_size=128, # it is recommended to set the value to 128 - desc_act=False, # set to False can significantly speed up inference but the perplexity may slightly bad -) - -# TEMPLATE = "<|prompt|>{instruction}<|answer|>" -prompter = AlpacaPrompter() - -# def load_data(data_path, tokenizer, n_samples, template=TEMPLATE): -def load_data(data_path, tokenizer, n_samples): - # Load dataset - dataset = load_dataset(data_path) - - if "train" in dataset: - raw_data = dataset["train"] - else: - raw_data = dataset - - # Sample from the dataset if n_samples is provided and less than the dataset size - if n_samples is not None and n_samples < len(raw_data): - raw_data = raw_data.shuffle(seed=42).select(range(n_samples)) - - def tokenize(examples): - instructions = examples["instruction"] - outputs = examples["output"] - - prompts = [] - texts = [] - input_ids = [] - attention_mask = [] - for input_text, output_text in zip(instructions, outputs): - # prompt = template.format(instruction=input_text) - # prompt = next(prompter.build_prompt(instruction=input_text, output=output_text)) - prompt = next(prompter.build_prompt(instruction=input_text)) - text = prompt + output_text - - if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length: - continue - - tokenized_data = tokenizer(text) - - input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length]) - attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length]) - prompts.append(prompt) - texts.append(text) - - return { 
- "input_ids": input_ids, - "attention_mask": attention_mask, - "prompt": prompts, - "text": texts, - } - - raw_data = raw_data.map( - tokenize, - batched=True, - batch_size=len(raw_data), - num_proc=1, - keep_in_memory=True, - load_from_cache_file=False, - # remove_columns=["instruction", "input"] - ) - - # Convert to PyTorch tensors - raw_data.set_format(type='torch', columns=['input_ids', 'attention_mask']) - - # for sample in dataset: - # sample["input_ids"] = torch.LongTensor(sample["input_ids"]) - # sample["attention_mask"] = torch.LongTensor(sample["attention_mask"]) - - return raw_data - - -# def get_tokenizer(): -# print("Loading tokenizer...") -# # tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) -# tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) -# return tokenizer - -# def get_model(): -def load_merged_model(cfg: DictDefault): - print("Loading model...") - - merged_out_dir = get_merged_out_dir(cfg) - - # Check if the merged model exists - if not merged_out_dir.exists(): - # If not, merge the model - print("Merged model not found. Merging...") - # model, tokenizer = load_model(cfg, inference=True) - # do_merge_lora_model_and_tokenizer(cfg=cfg, model=model, tokenizer=tokenizer) - raise NotImplementedError("Merging model is not implemented yet.") - - # load un-quantized model, by default, the model will always be loaded into CPU memory - model = AutoGPTQForCausalLM.from_pretrained(merged_out_dir, quantize_config) - # model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) - print("Model loaded.") - return model - -def get_quantized_model(cfg: DictDefault): - print("Loading quantized model...") - quantized_model_dir = get_quantized_model_dir(cfg) - model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_safetensors=True) - print("Model loaded.") - return model - -def quantize_and_save(cfg: DictDefault, model, tokenizer, examples_for_quant): - print("Quantize...") - start = time.time() - # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask" - model.quantize( - examples_for_quant, - batch_size=1, - # batch_size=args.quant_batch_size, - # use_triton=args.use_triton, - # autotune_warmup_after_quantized=args.use_triton - ) - end = time.time() - print(f"quantization took: {end - start: .4f}s") - - # save quantized model - print("Saving quantized model...") - # model.save_quantized(quantized_model_dir) - quantized_model_dir = get_quantized_model_dir(cfg) - model.save_quantized(quantized_model_dir, use_safetensors=True) - print("Saving tokenizer...") - tokenizer.save_pretrained(quantized_model_dir) - print("Saved.") - - return model - -def push_model(cfg: DictDefault, model, tokenizer): -# def push_model(model): - # push quantized model to Hugging Face Hub. - # to use use_auth_token=True, Login first via huggingface-cli login. 
- # or pass explcit token with: use_auth_token="hf_xxxxxxx" - # (uncomment the following three lines to enable this feature) - # repo_id = f"YourUserName/{quantized_model_dir}" - print("Pushing to Huggingface hub...") - # repo_id = f"{huggingface_username}/{quantized_model_dir}" - repo_id = get_quantized_model_id(cfg) - pretrained_model_dir = cfg['base_model'] - commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}" - # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True, safe_serialization=True) - # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, safe_serialization=True) - model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True) - tokenizer.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True) - print("Pushed.") - -# def push_tokenizer(tokenizer): - -def get_quantized_model_id(cfg: DictDefault): -# def get_quantized_model_id(cfg: DictDefault, quantize_config): - # return f"{cfg.hub_model_id}-{quantize_config.bits}bits-gr{quantize_config.group_size}-desc_act{quantize_config.desc_act}" - if not cfg.hub_model_id: - raise ValueError("Missing hub_model_id in the configuration.") - return f"{cfg.hub_model_id}-GPTQ" - -def get_quantized_model_dir(cfg: DictDefault): -# def get_quantized_model_dir(cfg: DictDefault, quantize_config): - if not cfg.output_dir: - raise ValueError("Missing output_dir in the configuration.") - return f"{cfg.output_dir.lstrip('./')}-GPTQ" - def main(): print("Starting...") # return # prompt = "<|prompt|>How can entrepreneurs start building their own communities even before launching their product?<|answer|>" - should_quantize = False + should_quantize = True # tokenizer = get_tokenizer() cfg = load_cfg(config_path) @@ -234,13 +72,13 @@ def main(): datasets = load_datasets(cfg=cfg, cli_args=TrainerCliArgs()) train_dataset = datasets.train_dataset n_samples = 128 - # n_samples = 2 - examples = train_dataset.shuffle(seed=42).select( - [ - random.randrange(0, len(train_dataset) - 1) # nosec - for _ in range(n_samples) - ] - ) + # # n_samples = 2 + # examples = train_dataset.shuffle(seed=42).select( + # [ + # random.randrange(0, len(train_dataset) - 1) # nosec + # for _ in range(n_samples) + # ] + # ) LOG.info("loading model and (optionally) peft_config...") # model, peft_config = load_model(cfg, tokenizer, inference=True) @@ -250,11 +88,12 @@ def main(): # examples = load_data(dataset_name, tokenizer, n_samples) # print(examples) - examples_for_quant = [ - {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]} - for example in examples - ] + # examples_for_quant = [ + # {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]} + # for example in examples + # ] # print(examples_for_quant) + examples_for_quant = get_examples_for_quantization(train_dataset, n_samples) modelq = quantize_and_save(cfg, model, tokenizer, examples_for_quant) else: @@ -263,7 +102,8 @@ def main(): push_model(cfg, modelq, tokenizer) -main() +if __name__ == "__main__": + main() # Load configure diff --git a/src/axolotl/utils/quantize.py b/src/axolotl/utils/quantize.py new file mode 100644 index 0000000000..f8214bda2a --- /dev/null +++ b/src/axolotl/utils/quantize.py @@ -0,0 +1,132 @@ +# pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ + +# import debugpy +# 
debugpy.listen(('0.0.0.0', 5678)) +# debugpy.wait_for_client() +# debugpy.breakpoint() + +import json +import random +import time +from pathlib import Path +import logging + +# import torch +# from datasets import load_dataset, Dataset +# from transformers import AutoTokenizer, LlamaTokenizer, TextGenerationPipeline +from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig +from axolotl.prompters import AlpacaPrompter +from axolotl.utils.models import load_model, load_tokenizer +from axolotl.common.cli import TrainerCliArgs +from axolotl.logging_config import configure_logging +from axolotl.utils.dict import DictDefault +# from finetune import load_cfg, get_merged_out_dir, do_merge_lora_model_and_tokenizer + +# configure_logging() +# LOG = logging.getLogger("axolotl") + +quantize_config = BaseQuantizeConfig( + bits=4, # quantize model to 4-bit + group_size=128, # it is recommended to set the value to 128 + desc_act=False, # set to False can significantly speed up inference but the perplexity may slightly bad +) + +def get_merged_out_dir(cfg: DictDefault): + return Path(cfg.output_dir) / "merged" + +def load_merged_model(cfg: DictDefault): + print("Loading merged model...") + + merged_out_dir = get_merged_out_dir(cfg) + + # Check if the merged model exists + if not merged_out_dir.exists(): + # If not, merge the model + print("Merged model not found. Merging...") + # model, tokenizer = load_model(cfg, inference=True) + # do_merge_lora_model_and_tokenizer(cfg=cfg, model=model, tokenizer=tokenizer) + raise NotImplementedError("Merging model is not implemented yet.") + + # load un-quantized model, by default, the model will always be loaded into CPU memory + model = AutoGPTQForCausalLM.from_pretrained(merged_out_dir, quantize_config) + # model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) + print("Model loaded.") + return model + +def get_quantized_model(cfg: DictDefault): + print("Loading quantized model...") + quantized_model_dir = get_quantized_model_dir(cfg) + model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_safetensors=True) + print("Model loaded.") + return model + +def quantize_and_save(cfg: DictDefault, model, tokenizer, examples_for_quant): + print("Quantize...") + start = time.time() + # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask" + model.quantize( + examples_for_quant, + batch_size=1, + # batch_size=args.quant_batch_size, + # use_triton=args.use_triton, + # autotune_warmup_after_quantized=args.use_triton + ) + end = time.time() + print(f"quantization took: {end - start: .4f}s") + + # save quantized model + print("Saving quantized model...") + # model.save_quantized(quantized_model_dir) + quantized_model_dir = get_quantized_model_dir(cfg) + model.save_quantized(quantized_model_dir, use_safetensors=True) + print("Saving tokenizer...") + tokenizer.save_pretrained(quantized_model_dir) + print("Saved.") + + return model + +def push_model(cfg: DictDefault, model, tokenizer): +# def push_model(model): + # push quantized model to Hugging Face Hub. + # to use use_auth_token=True, Login first via huggingface-cli login. 
+    # or pass explicit token with: use_auth_token="hf_xxxxxxx"
+    # (uncomment the following three lines to enable this feature)
+    # repo_id = f"YourUserName/{quantized_model_dir}"
+    print("Pushing to Hugging Face Hub...")
+    # repo_id = f"{huggingface_username}/{quantized_model_dir}"
+    repo_id = get_quantized_model_id(cfg)
+    pretrained_model_dir = cfg['base_model']
+    commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+    # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True, safe_serialization=True)
+    # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, safe_serialization=True)
+    model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True)
+    tokenizer.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
+    print("Pushed.")
+
+def get_quantized_model_id(cfg: DictDefault):
+# def get_quantized_model_id(cfg: DictDefault, quantize_config):
+    # return f"{cfg.hub_model_id}-{quantize_config.bits}bits-gr{quantize_config.group_size}-desc_act{quantize_config.desc_act}"
+    if not cfg.hub_model_id:
+        raise ValueError("Missing hub_model_id in the configuration.")
+    return f"{cfg.hub_model_id}-GPTQ"
+
+def get_quantized_model_dir(cfg: DictDefault):
+# def get_quantized_model_dir(cfg: DictDefault, quantize_config):
+    if not cfg.output_dir:
+        raise ValueError("Missing output_dir in the configuration.")
+    return f"{cfg.output_dir.lstrip('./')}-GPTQ"
+
+def get_examples_for_quantization(dataset, n_samples):
+    print("Loading dataset...")
+    examples = dataset.shuffle(seed=42).select(
+        [
+            random.randrange(0, len(dataset) - 1) # nosec
+            for _ in range(n_samples)
+        ]
+    )
+
+    examples_for_quant = [
+        {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
+        for example in examples
+    ]
+    return examples_for_quant
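For reference, a minimal sketch of consuming the artifacts this patch produces, mirroring the from_quantized call already used by get_quantized_model(). It assumes a run with examples/llama-2/lora-short.yml, so quantize_and_save() has written the GPTQ weights and tokenizer to lora-out-GPTQ (the name get_quantized_model_dir() derives from output_dir: ./lora-out); the directory name and prompt string below are illustrative assumptions, not part of the patch.

# Inference sketch (assumes ./lora-out-GPTQ was produced by quantize_and_save above).
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

quantized_dir = "lora-out-GPTQ"  # hypothetical path; get_quantized_model_dir() strips the leading "./"

# Same loading call used by get_quantized_model() in src/axolotl/utils/quantize.py.
model = AutoGPTQForCausalLM.from_quantized(quantized_dir, device="cuda:0", use_safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(quantized_dir)  # saved next to the weights by quantize_and_save()

# Hypothetical Alpaca-style prompt, matching the alpaca dataset format used for calibration.
prompt = "### Instruction:\nSummarize what GPTQ quantization does.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda:0")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))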