WIP Integrate quantization into finetune script
Glavin001 committed Sep 11, 2023
1 parent e4c1a2e commit 19a30cf
Showing 4 changed files with 263 additions and 182 deletions.
70 changes: 70 additions & 0 deletions examples/llama-2/lora-short.yml
@@ -0,0 +1,70 @@
base_model: meta-llama/Llama-2-7b-hf
base_model_config: meta-llama/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path: last_run_prepared
# val_set_size: 0.01
val_set_size: 0.001
output_dir: ./lora-out

sequence_len: 4096
sample_packing: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
# num_epochs: 3
# num_epochs: 1
num_epochs: 0.1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
eval_steps: 20
save_steps:
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"
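
This shortened config (num_epochs: 0.1, tiny val_set_size) is intended for quickly exercising the new train, merge, and quantize path end to end; scripts/quantize.py below points its config_path at this file. It can be run through the finetune script in the usual way, e.g. accelerate launch scripts/finetune.py examples/llama-2/lora-short.yml (the exact launch invocation may differ depending on your accelerate setup).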
45 changes: 42 additions & 3 deletions scripts/finetune.py
@@ -27,6 +27,7 @@
from axolotl.utils.models import load_tokenizer
from axolotl.utils.tokenization import check_dataset_labels
from axolotl.utils.wandb import setup_wandb_env_vars
from axolotl.utils.quantize import get_examples_for_quantization, load_merged_model, quantize_and_save

project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
@@ -87,8 +88,14 @@ def do_merge_lora(
cfg: DictDefault,
cli_args: TrainerCliArgs,
):
model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
do_merge_lora_model_and_tokenizer(cfg=cfg, model=model, tokenizer=tokenizer)
new_cfg = DictDefault({
**cfg,
'lora_model_dir': cfg['output_dir'],
'load_in_8bit': False,
'load_in_4bit': False,
})
model, tokenizer = load_model_and_tokenizer(cfg=new_cfg, cli_args=cli_args)
do_merge_lora_model_and_tokenizer(cfg=new_cfg, model=model, tokenizer=tokenizer)
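
The config is rebuilt here so the base model is reloaded in full precision (8-bit and 4-bit loading disabled) with lora_model_dir pointing at the freshly trained adapter, since LoRA weights cannot be cleanly merged into a bitsandbytes-quantized base model. For context, a minimal sketch of what the merge step (do_merge_lora_model_and_tokenizer, defined elsewhere in this file) is assumed to do with a PEFT model; this is not the repository's exact code:

from pathlib import Path

# Sketch only: PEFT-style merge, assuming `model` is a peft.PeftModel and cfg.output_dir is set.
def sketch_merge_lora_model_and_tokenizer(cfg, model, tokenizer):
    merged = model.merge_and_unload()                 # fold the LoRA deltas into the base weights
    merged_out_dir = Path(cfg.output_dir) / "merged"  # assumed layout of get_merged_out_dir()
    merged.save_pretrained(merged_out_dir, safe_serialization=True)
    tokenizer.save_pretrained(merged_out_dir)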

def shard(
*,
@@ -282,7 +289,39 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
if parsed_cli_args.prepare_ds_only:
return
train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
model, tokenizer = train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
# tokenizer = None
should_quantize = True

if should_quantize:
# Merge model
# do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args)
# do_merge_lora_model_and_tokenizer(cfg=parsed_cfg, model=model, tokenizer=tokenizer)
# new_cfg = parsed_cfg.copy()
# new_cfg['lora_model_dir'] = new_cfg['output_dir']
# new_cfg['load_in_8bit'] = False
# new_cfg['load_in_4bit'] = False

# new_cfg = DictDefault({
# **parsed_cfg,
# 'lora_model_dir': parsed_cfg['output_dir'],
# 'load_in_8bit': False,
# 'load_in_4bit': False,
# })
# lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False
# do_merge_lora(cfg=new_cfg, cli_args=parsed_cli_args)

# TODO: release old model from GPU memory
do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args)

# Load merged model with AutoGPTQ
merged_model = load_merged_model(parsed_cfg)

# Quantize & save
n_samples = 128
examples = get_examples_for_quantization(dataset_meta.train_dataset, n_samples)
quantize_and_save(parsed_cfg, merged_model, tokenizer, examples)



if __name__ == "__main__":
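
The new do_cli flow after training is: merge the LoRA adapter into the base model, reload the merged model with AutoGPTQ, quantize it on a sample of the training data, and save the result. The helpers come from the new axolotl.utils.quantize module, whose diff is not rendered on this page; judging from the code it replaces in scripts/quantize.py below, get_examples_for_quantization plausibly looks like the following sketch (the name comes from the import above, the body is an assumption):

# Sketch only: assumed shape of get_examples_for_quantization in axolotl.utils.quantize.
def get_examples_for_quantization(train_dataset, n_samples):
    """Sample tokenized rows and keep only the keys AutoGPTQ's quantize() accepts."""
    examples = train_dataset.shuffle(seed=42).select(range(min(n_samples, len(train_dataset))))
    return [
        {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
        for example in examples
    ]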
198 changes: 19 additions & 179 deletions scripts/quantize.py
@@ -23,6 +23,8 @@
# from scripts.finetune import load_cfg
from finetune import load_cfg, get_merged_out_dir, do_merge_lora_model_and_tokenizer, load_datasets

from axolotl.utils.quantize import load_merged_model, get_quantized_model, quantize_and_save, push_model, get_quantized_model_id, get_quantized_model_dir, get_examples_for_quantization

configure_logging()
LOG = logging.getLogger("axolotl")

@@ -39,185 +41,21 @@
print("Done importing...")

## CHANGE BELOW ##
config_path: Path = Path("./examples/llama-2/lora.yml")
# config_path: Path = Path("./examples/llama-2/lora.yml")
config_path: Path = Path("./examples/llama-2/lora-short.yml")

# pretrained_model_dir = "facebook/opt-125m"
# quantized_model_dir = "opt-125m-4bit"
dataset_name = "teknium/GPT4-LLM-Cleaned"
# huggingface_username = "CHANGE_ME"
## CHANGE ABOVE ##

quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting this to False can significantly speed up inference, but perplexity may be slightly worse
)

# TEMPLATE = "<|prompt|>{instruction}</s><|answer|>"
prompter = AlpacaPrompter()

# def load_data(data_path, tokenizer, n_samples, template=TEMPLATE):
def load_data(data_path, tokenizer, n_samples):
# Load dataset
dataset = load_dataset(data_path)

if "train" in dataset:
raw_data = dataset["train"]
else:
raw_data = dataset

# Sample from the dataset if n_samples is provided and less than the dataset size
if n_samples is not None and n_samples < len(raw_data):
raw_data = raw_data.shuffle(seed=42).select(range(n_samples))

def tokenize(examples):
instructions = examples["instruction"]
outputs = examples["output"]

prompts = []
texts = []
input_ids = []
attention_mask = []
for input_text, output_text in zip(instructions, outputs):
# prompt = template.format(instruction=input_text)
# prompt = next(prompter.build_prompt(instruction=input_text, output=output_text))
prompt = next(prompter.build_prompt(instruction=input_text))
text = prompt + output_text

if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length:
continue

tokenized_data = tokenizer(text)

input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
prompts.append(prompt)
texts.append(text)

return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"prompt": prompts,
"text": texts,
}

raw_data = raw_data.map(
tokenize,
batched=True,
batch_size=len(raw_data),
num_proc=1,
keep_in_memory=True,
load_from_cache_file=False,
# remove_columns=["instruction", "input"]
)

# Convert to PyTorch tensors
raw_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# for sample in dataset:
# sample["input_ids"] = torch.LongTensor(sample["input_ids"])
# sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])

return raw_data


# def get_tokenizer():
# print("Loading tokenizer...")
# # tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
# tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
# return tokenizer

# def get_model():
def load_merged_model(cfg: DictDefault):
print("Loading model...")

merged_out_dir = get_merged_out_dir(cfg)

# Check if the merged model exists
if not merged_out_dir.exists():
# If not, merge the model
print("Merged model not found. Merging...")
# model, tokenizer = load_model(cfg, inference=True)
# do_merge_lora_model_and_tokenizer(cfg=cfg, model=model, tokenizer=tokenizer)
raise NotImplementedError("Merging model is not implemented yet.")

# load the un-quantized model; by default, the model is always loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(merged_out_dir, quantize_config)
# model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
print("Model loaded.")
return model

def get_quantized_model(cfg: DictDefault):
print("Loading quantized model...")
quantized_model_dir = get_quantized_model_dir(cfg)
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_safetensors=True)
print("Model loaded.")
return model

def quantize_and_save(cfg: DictDefault, model, tokenizer, examples_for_quant):
print("Quantize...")
start = time.time()
# quantize the model; the examples should be a list of dicts whose only keys are "input_ids" and "attention_mask"
model.quantize(
examples_for_quant,
batch_size=1,
# batch_size=args.quant_batch_size,
# use_triton=args.use_triton,
# autotune_warmup_after_quantized=args.use_triton
)
end = time.time()
print(f"quantization took: {end - start: .4f}s")

# save quantized model
print("Saving quantized model...")
# model.save_quantized(quantized_model_dir)
quantized_model_dir = get_quantized_model_dir(cfg)
model.save_quantized(quantized_model_dir, use_safetensors=True)
print("Saving tokenizer...")
tokenizer.save_pretrained(quantized_model_dir)
print("Saved.")

return model

def push_model(cfg: DictDefault, model, tokenizer):
# def push_model(model):
# push quantized model to Hugging Face Hub.
# to use use_auth_token=True, log in first via huggingface-cli login,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
print("Pushing to Huggingface hub...")
# repo_id = f"{huggingface_username}/{quantized_model_dir}"
repo_id = get_quantized_model_id(cfg)
pretrained_model_dir = cfg['base_model']
commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True, safe_serialization=True)
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, safe_serialization=True)
model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True, use_safetensors=True)
tokenizer.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
print("Pushed.")

# def push_tokenizer(tokenizer):

def get_quantized_model_id(cfg: DictDefault):
# def get_quantized_model_id(cfg: DictDefault, quantize_config):
# return f"{cfg.hub_model_id}-{quantize_config.bits}bits-gr{quantize_config.group_size}-desc_act{quantize_config.desc_act}"
if not cfg.hub_model_id:
raise ValueError("Missing hub_model_id in the configuration.")
return f"{cfg.hub_model_id}-GPTQ"

def get_quantized_model_dir(cfg: DictDefault):
# def get_quantized_model_dir(cfg: DictDefault, quantize_config):
if not cfg.output_dir:
raise ValueError("Missing output_dir in the configuration.")
return f"{cfg.output_dir.lstrip('./')}-GPTQ"

def main():
print("Starting...")
# return
# prompt = "<|prompt|>How can entrepreneurs start building their own communities even before launching their product?</s><|answer|>"

should_quantize = False
should_quantize = True
# tokenizer = get_tokenizer()

cfg = load_cfg(config_path)
@@ -234,13 +72,13 @@ def main():
datasets = load_datasets(cfg=cfg, cli_args=TrainerCliArgs())
train_dataset = datasets.train_dataset
n_samples = 128
# n_samples = 2
examples = train_dataset.shuffle(seed=42).select(
[
random.randrange(0, len(train_dataset) - 1) # nosec
for _ in range(n_samples)
]
)
# # n_samples = 2
# examples = train_dataset.shuffle(seed=42).select(
# [
# random.randrange(0, len(train_dataset) - 1) # nosec
# for _ in range(n_samples)
# ]
# )

LOG.info("loading model and (optionally) peft_config...")
# model, peft_config = load_model(cfg, tokenizer, inference=True)
@@ -250,11 +88,12 @@
# examples = load_data(dataset_name, tokenizer, n_samples)

# print(examples)
examples_for_quant = [
{"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
for example in examples
]
# examples_for_quant = [
# {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
# for example in examples
# ]
# print(examples_for_quant)
examples_for_quant = get_examples_for_quantization(train_dataset, n_samples)

modelq = quantize_and_save(cfg, model, tokenizer, examples_for_quant)
else:
@@ -263,7 +102,8 @@

push_model(cfg, modelq, tokenizer)

main()
if __name__ == "__main__":
main()


# Load config
132 changes: 132 additions & 0 deletions src/axolotl/utils/quantize.py (diff not rendered)
