From 19a30cfedec0c7ceb7711a33b0d42f40bf29658e Mon Sep 17 00:00:00 2001 From: Glavin Wiechert Date: Sun, 10 Sep 2023 22:34:37 +0000 Subject: [PATCH] WIP Integrate quantization into finetune script --- examples/llama-2/lora-short.yml | 70 +++++++++++ scripts/finetune.py | 45 +++++++- scripts/quantize.py | 198 +++----------------------------- src/axolotl/utils/quantize.py | 132 +++++++++++++++++++++ 4 files changed, 263 insertions(+), 182 deletions(-) create mode 100644 examples/llama-2/lora-short.yml create mode 100644 src/axolotl/utils/quantize.py diff --git a/examples/llama-2/lora-short.yml b/examples/llama-2/lora-short.yml new file mode 100644 index 0000000000..bd2b51b962 --- /dev/null +++ b/examples/llama-2/lora-short.yml @@ -0,0 +1,70 @@ +base_model: meta-llama/Llama-2-7b-hf +base_model_config: meta-llama/Llama-2-7b-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +# val_set_size: 0.01 +val_set_size: 0.001 +output_dir: ./lora-out + +sequence_len: 4096 +sample_packing: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +# num_epochs: 3 +# num_epochs: 1 +num_epochs: 0.1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/scripts/finetune.py b/scripts/finetune.py index 1ea18b98b8..ecef38bba0 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -27,6 +27,7 @@ from axolotl.utils.models import load_tokenizer from axolotl.utils.tokenization import check_dataset_labels from axolotl.utils.wandb import setup_wandb_env_vars +from axolotl.utils.quantize import get_examples_for_quantization, load_merged_model, quantize_and_save project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) src_dir = os.path.join(project_root, "src") @@ -87,8 +88,14 @@ def do_merge_lora( cfg: DictDefault, cli_args: TrainerCliArgs, ): - model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) - do_merge_lora_model_and_tokenizer(cfg=cfg, model=model, tokenizer=tokenizer) + new_cfg = DictDefault({ + **cfg, + 'lora_model_dir': cfg['output_dir'], + 'load_in_8bit': False, + 'load_in_4bit': False, + }) + model, tokenizer = load_model_and_tokenizer(cfg=new_cfg, cli_args=cli_args) + do_merge_lora_model_and_tokenizer(cfg=new_cfg, model=model, tokenizer=tokenizer) def shard( *, @@ -282,7 +289,39 @@ def do_cli(config: Path = Path("examples/"), **kwargs): dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args) if parsed_cli_args.prepare_ds_only: return - train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta) + model, tokenizer = train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta) + # tokenizer = None + should_quantize = True + + if should_quantize: + # Merge model + # 
do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args) + # do_merge_lora_model_and_tokenizer(cfg=parsed_cfg, model=model, tokenizer=tokenizer) + # new_cfg = parsed_cfg.copy() + # new_cfg['lora_model_dir'] = new_cfg['output_dir'] + # new_cfg['load_in_8bit'] = False + # new_cfg['load_in_4bit'] = False + + # new_cfg = DictDefault({ + # **parsed_cfg, + # 'lora_model_dir': parsed_cfg['output_dir'], + # 'load_in_8bit': False, + # 'load_in_4bit': False, + # }) + # lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False + # do_merge_lora(cfg=new_cfg, cli_args=parsed_cli_args) + + # TODO: release old model from GPU memory + do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args) + + # Load merged model with AutoGPTQ + merged_model = load_merged_model(parsed_cfg) + + # Quantize & save + n_samples = 128 + examples = get_examples_for_quantization(dataset_meta.train_dataset, n_samples) + quantize_and_save(parsed_cfg, merged_model, tokenizer, examples) + if __name__ == "__main__": diff --git a/scripts/quantize.py b/scripts/quantize.py index 56106cc655..ff382b7da1 100644 --- a/scripts/quantize.py +++ b/scripts/quantize.py @@ -23,6 +23,8 @@ # from scripts.finetune import load_cfg from finetune import load_cfg, get_merged_out_dir, do_merge_lora_model_and_tokenizer, load_datasets +from axolotl.utils.quantize import load_merged_model, get_quantized_model, quantize_and_save, push_model, get_quantized_model_id, get_quantized_model_dir, get_examples_for_quantization + configure_logging() LOG = logging.getLogger("axolotl") @@ -39,7 +41,8 @@ print("Done importing...") ## CHANGE BELOW ## -config_path: Path = Path("./examples/llama-2/lora.yml") +# config_path: Path = Path("./examples/llama-2/lora.yml") +config_path: Path = Path("./examples/llama-2/lora-short.yml") # pretrained_model_dir = "facebook/opt-125m" # quantized_model_dir = "opt-125m-4bit" @@ -47,177 +50,12 @@ # huggingface_username = "CHANGE_ME" ## CHANGE ABOVE -quantize_config = BaseQuantizeConfig( - bits=4, # quantize model to 4-bit - group_size=128, # it is recommended to set the value to 128 - desc_act=False, # set to False can significantly speed up inference but the perplexity may slightly bad -) - -# TEMPLATE = "<|prompt|>{instruction}<|answer|>" -prompter = AlpacaPrompter() - -# def load_data(data_path, tokenizer, n_samples, template=TEMPLATE): -def load_data(data_path, tokenizer, n_samples): - # Load dataset - dataset = load_dataset(data_path) - - if "train" in dataset: - raw_data = dataset["train"] - else: - raw_data = dataset - - # Sample from the dataset if n_samples is provided and less than the dataset size - if n_samples is not None and n_samples < len(raw_data): - raw_data = raw_data.shuffle(seed=42).select(range(n_samples)) - - def tokenize(examples): - instructions = examples["instruction"] - outputs = examples["output"] - - prompts = [] - texts = [] - input_ids = [] - attention_mask = [] - for input_text, output_text in zip(instructions, outputs): - # prompt = template.format(instruction=input_text) - # prompt = next(prompter.build_prompt(instruction=input_text, output=output_text)) - prompt = next(prompter.build_prompt(instruction=input_text)) - text = prompt + output_text - - if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length: - continue - - tokenized_data = tokenizer(text) - - input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length]) - attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length]) - prompts.append(prompt) - texts.append(text) - - return { 
- "input_ids": input_ids, - "attention_mask": attention_mask, - "prompt": prompts, - "text": texts, - } - - raw_data = raw_data.map( - tokenize, - batched=True, - batch_size=len(raw_data), - num_proc=1, - keep_in_memory=True, - load_from_cache_file=False, - # remove_columns=["instruction", "input"] - ) - - # Convert to PyTorch tensors - raw_data.set_format(type='torch', columns=['input_ids', 'attention_mask']) - - # for sample in dataset: - # sample["input_ids"] = torch.LongTensor(sample["input_ids"]) - # sample["attention_mask"] = torch.LongTensor(sample["attention_mask"]) - - return raw_data - - -# def get_tokenizer(): -# print("Loading tokenizer...") -# # tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) -# tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) -# return tokenizer - -# def get_model(): -def load_merged_model(cfg: DictDefault): - print("Loading model...") - - merged_out_dir = get_merged_out_dir(cfg) - - # Check if the merged model exists - if not merged_out_dir.exists(): - # If not, merge the model - print("Merged model not found. Merging...") - # model, tokenizer = load_model(cfg, inference=True) - # do_merge_lora_model_and_tokenizer(cfg=cfg, model=model, tokenizer=tokenizer) - raise NotImplementedError("Merging model is not implemented yet.") - - # load un-quantized model, by default, the model will always be loaded into CPU memory - model = AutoGPTQForCausalLM.from_pretrained(merged_out_dir, quantize_config) - # model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) - print("Model loaded.") - return model - -def get_quantized_model(cfg: DictDefault): - print("Loading quantized model...") - quantized_model_dir = get_quantized_model_dir(cfg) - model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_safetensors=True) - print("Model loaded.") - return model - -def quantize_and_save(cfg: DictDefault, model, tokenizer, examples_for_quant): - print("Quantize...") - start = time.time() - # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask" - model.quantize( - examples_for_quant, - batch_size=1, - # batch_size=args.quant_batch_size, - # use_triton=args.use_triton, - # autotune_warmup_after_quantized=args.use_triton - ) - end = time.time() - print(f"quantization took: {end - start: .4f}s") - - # save quantized model - print("Saving quantized model...") - # model.save_quantized(quantized_model_dir) - quantized_model_dir = get_quantized_model_dir(cfg) - model.save_quantized(quantized_model_dir, use_safetensors=True) - print("Saving tokenizer...") - tokenizer.save_pretrained(quantized_model_dir) - print("Saved.") - - return model - -def push_model(cfg: DictDefault, model, tokenizer): -# def push_model(model): - # push quantized model to Hugging Face Hub. - # to use use_auth_token=True, Login first via huggingface-cli login. 
- # or pass explcit token with: use_auth_token="hf_xxxxxxx" - # (uncomment the following three lines to enable this feature) - # repo_id = f"YourUserName/{quantized_model_dir}" - print("Pushing to Huggingface hub...") - # repo_id = f"{huggingface_username}/{quantized_model_dir}" - repo_id = get_quantized_model_id(cfg) - pretrained_model_dir = cfg['base_model'] - commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}" - # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True, safe_serialization=True) - # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, safe_serialization=True) - model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True) - tokenizer.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True) - print("Pushed.") - -# def push_tokenizer(tokenizer): - -def get_quantized_model_id(cfg: DictDefault): -# def get_quantized_model_id(cfg: DictDefault, quantize_config): - # return f"{cfg.hub_model_id}-{quantize_config.bits}bits-gr{quantize_config.group_size}-desc_act{quantize_config.desc_act}" - if not cfg.hub_model_id: - raise ValueError("Missing hub_model_id in the configuration.") - return f"{cfg.hub_model_id}-GPTQ" - -def get_quantized_model_dir(cfg: DictDefault): -# def get_quantized_model_dir(cfg: DictDefault, quantize_config): - if not cfg.output_dir: - raise ValueError("Missing output_dir in the configuration.") - return f"{cfg.output_dir.lstrip('./')}-GPTQ" - def main(): print("Starting...") # return # prompt = "<|prompt|>How can entrepreneurs start building their own communities even before launching their product?<|answer|>" - should_quantize = False + should_quantize = True # tokenizer = get_tokenizer() cfg = load_cfg(config_path) @@ -234,13 +72,13 @@ def main(): datasets = load_datasets(cfg=cfg, cli_args=TrainerCliArgs()) train_dataset = datasets.train_dataset n_samples = 128 - # n_samples = 2 - examples = train_dataset.shuffle(seed=42).select( - [ - random.randrange(0, len(train_dataset) - 1) # nosec - for _ in range(n_samples) - ] - ) + # # n_samples = 2 + # examples = train_dataset.shuffle(seed=42).select( + # [ + # random.randrange(0, len(train_dataset) - 1) # nosec + # for _ in range(n_samples) + # ] + # ) LOG.info("loading model and (optionally) peft_config...") # model, peft_config = load_model(cfg, tokenizer, inference=True) @@ -250,11 +88,12 @@ def main(): # examples = load_data(dataset_name, tokenizer, n_samples) # print(examples) - examples_for_quant = [ - {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]} - for example in examples - ] + # examples_for_quant = [ + # {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]} + # for example in examples + # ] # print(examples_for_quant) + examples_for_quant = get_examples_for_quantization(train_dataset, n_samples) modelq = quantize_and_save(cfg, model, tokenizer, examples_for_quant) else: @@ -263,7 +102,8 @@ def main(): push_model(cfg, modelq, tokenizer) -main() +if __name__ == "__main__": + main() # Load configure diff --git a/src/axolotl/utils/quantize.py b/src/axolotl/utils/quantize.py new file mode 100644 index 0000000000..f8214bda2a --- /dev/null +++ b/src/axolotl/utils/quantize.py @@ -0,0 +1,132 @@ +# pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ + +# import debugpy +# 
debugpy.listen(('0.0.0.0', 5678)) +# debugpy.wait_for_client() +# debugpy.breakpoint() + +import json +import random +import time +from pathlib import Path +import logging + +# import torch +# from datasets import load_dataset, Dataset +# from transformers import AutoTokenizer, LlamaTokenizer, TextGenerationPipeline +from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig +from axolotl.prompters import AlpacaPrompter +from axolotl.utils.models import load_model, load_tokenizer +from axolotl.common.cli import TrainerCliArgs +from axolotl.logging_config import configure_logging +from axolotl.utils.dict import DictDefault +# from finetune import load_cfg, get_merged_out_dir, do_merge_lora_model_and_tokenizer + +# configure_logging() +# LOG = logging.getLogger("axolotl") + +quantize_config = BaseQuantizeConfig( + bits=4, # quantize model to 4-bit + group_size=128, # it is recommended to set the value to 128 + desc_act=False, # set to False can significantly speed up inference but the perplexity may slightly bad +) + +def get_merged_out_dir(cfg: DictDefault): + return Path(cfg.output_dir) / "merged" + +def load_merged_model(cfg: DictDefault): + print("Loading merged model...") + + merged_out_dir = get_merged_out_dir(cfg) + + # Check if the merged model exists + if not merged_out_dir.exists(): + # If not, merge the model + print("Merged model not found. Merging...") + # model, tokenizer = load_model(cfg, inference=True) + # do_merge_lora_model_and_tokenizer(cfg=cfg, model=model, tokenizer=tokenizer) + raise NotImplementedError("Merging model is not implemented yet.") + + # load un-quantized model, by default, the model will always be loaded into CPU memory + model = AutoGPTQForCausalLM.from_pretrained(merged_out_dir, quantize_config) + # model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) + print("Model loaded.") + return model + +def get_quantized_model(cfg: DictDefault): + print("Loading quantized model...") + quantized_model_dir = get_quantized_model_dir(cfg) + model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_safetensors=True) + print("Model loaded.") + return model + +def quantize_and_save(cfg: DictDefault, model, tokenizer, examples_for_quant): + print("Quantize...") + start = time.time() + # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask" + model.quantize( + examples_for_quant, + batch_size=1, + # batch_size=args.quant_batch_size, + # use_triton=args.use_triton, + # autotune_warmup_after_quantized=args.use_triton + ) + end = time.time() + print(f"quantization took: {end - start: .4f}s") + + # save quantized model + print("Saving quantized model...") + # model.save_quantized(quantized_model_dir) + quantized_model_dir = get_quantized_model_dir(cfg) + model.save_quantized(quantized_model_dir, use_safetensors=True) + print("Saving tokenizer...") + tokenizer.save_pretrained(quantized_model_dir) + print("Saved.") + + return model + +def push_model(cfg: DictDefault, model, tokenizer): +# def push_model(model): + # push quantized model to Hugging Face Hub. + # to use use_auth_token=True, Login first via huggingface-cli login. 
+    # or pass explicit token with: use_auth_token="hf_xxxxxxx"
+    # (uncomment the following three lines to enable this feature)
+    # repo_id = f"YourUserName/{quantized_model_dir}"
+    print("Pushing to Hugging Face Hub...")
+    # repo_id = f"{huggingface_username}/{quantized_model_dir}"
+    repo_id = get_quantized_model_id(cfg)
+    pretrained_model_dir = cfg['base_model']
+    commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+    # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True, safe_serialization=True)
+    # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, safe_serialization=True)
+    model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True)
+    tokenizer.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
+    print("Pushed.")
+
+def get_quantized_model_id(cfg: DictDefault):
+# def get_quantized_model_id(cfg: DictDefault, quantize_config):
+    # return f"{cfg.hub_model_id}-{quantize_config.bits}bits-gr{quantize_config.group_size}-desc_act{quantize_config.desc_act}"
+    if not cfg.hub_model_id:
+        raise ValueError("Missing hub_model_id in the configuration.")
+    return f"{cfg.hub_model_id}-GPTQ"
+
+def get_quantized_model_dir(cfg: DictDefault):
+# def get_quantized_model_dir(cfg: DictDefault, quantize_config):
+    if not cfg.output_dir:
+        raise ValueError("Missing output_dir in the configuration.")
+    return f"{cfg.output_dir.lstrip('./')}-GPTQ"
+
+def get_examples_for_quantization(dataset, n_samples):
+    print("Loading dataset...")
+    examples = dataset.shuffle(seed=42).select(
+        [
+            random.randrange(0, len(dataset) - 1) # nosec
+            for _ in range(n_samples)
+        ]
+    )
+
+    examples_for_quant = [
+        {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
+        for example in examples
+    ]
+    return examples_for_quant
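For reference, a minimal sketch of consuming the artifacts this patch produces, mirroring the from_quantized call already used by get_quantized_model(). It assumes a run with examples/llama-2/lora-short.yml, so quantize_and_save() has written the GPTQ weights and tokenizer to lora-out-GPTQ (the name get_quantized_model_dir() derives from output_dir: ./lora-out); the directory name and prompt string below are illustrative assumptions, not part of the patch.

# Inference sketch (assumes ./lora-out-GPTQ was produced by quantize_and_save above).
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

quantized_dir = "lora-out-GPTQ"  # hypothetical path; get_quantized_model_dir() strips the leading "./"

# Same loading call used by get_quantized_model() in src/axolotl/utils/quantize.py.
model = AutoGPTQForCausalLM.from_quantized(quantized_dir, device="cuda:0", use_safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(quantized_dir)  # saved next to the weights by quantize_and_save()

# Hypothetical Alpaca-style prompt, matching the alpaca dataset format used for calibration.
prompt = "### Instruction:\nSummarize what GPTQ quantization does.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda:0")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))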