Merge pull request #80 from furiosa-ai/save_qlv4
Save qlv4
Mincho0102 authored Jul 5, 2024
2 parents 7f01b47 + a967d70 commit 5c89e8c
Showing 4 changed files with 250 additions and 49 deletions.
187 changes: 187 additions & 0 deletions language/bert/qlv4_save.py
@@ -0,0 +1,187 @@
# coding=utf-8
# Copyright 2021 Arm Limited and affiliates.
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess
import sys
import torch

import mlperf_loadgen as lg

sys.path.insert(0, os.getcwd())
sys.path.insert(0, os.path.join(os.getcwd(), "..", "..", "lon"))
import model_compressor
from absl import app, flags

from quantization import get_quant_model
from utils import random_seed, set_optimization


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--backend",
choices=["tf", "pytorch", "onnxruntime", "tf_estimator", "ray"],
default="tf",
help="Backend",
)
parser.add_argument(
"--scenario",
choices=["SingleStream", "Offline", "Server", "MultiStream"],
default="Offline",
help="Scenario",
)
parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass")
parser.add_argument(
"--quantized",
action="store_true",
help="use quantized model (only valid for onnxruntime backend)",
)
parser.add_argument(
"--profile",
action="store_true",
help="enable profiling (only valid for onnxruntime backend)",
)
parser.add_argument(
"--mlperf_conf", default="build/mlperf.conf", help="mlperf rules config"
)
parser.add_argument(
"--user_conf",
default="user.conf",
help="user config for user LoadGen settings such as target QPS",
)
parser.add_argument(
"--audit_conf",
default="audit.conf",
help="audit config for LoadGen settings during compliance runs",
)
parser.add_argument(
"--max_examples",
type=int,
help="Maximum number of examples to consider (not limited by default)",
)
parser.add_argument(
"--network",
choices=["sut", "lon", None],
default=None,
help="Loadgen network mode",
)
parser.add_argument("--node", type=str, default="")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument(
"--sut_server",
nargs="*",
default=["http://localhost:8000"],
help="Address of the server(s) under test.",
)
    parser.add_argument(
        "--model_script_path",
        default="./quantization/model_script/Qlevel4_RGDA0-W8A8KV8-PTQ.yaml",
        help="path to the quantization model script (YAML)",
    )
parser.add_argument(
"--use_mcp", action="store_true", help="use mcp to quantize the model"
)
parser.add_argument("--n_calib", type=int, default=-1)
parser.add_argument(
"--torch_optim",
default="none",
type=str,
choices=["default", "none"],
help="Torch optimization.",
)
    parser.add_argument(
        "--n_layers",
        default=-1,
        type=int,
        help="set the number of layers",
    )
parser.add_argument(
"--model_source",
default="mlperf_submission",
type=str,
choices=[
"huggingface_rngd_gelu",
"mlperf_submission",
"experimental_huggingface_unsplit_packed",
],
help="choose model source",
)
    parser.add_argument(
        "--output_path",
        default="./quantization/output",
        help="directory where quantization outputs are written",
    )

args = parser.parse_args()
return args


scenario_map = {
"SingleStream": lg.TestScenario.SingleStream,
"Offline": lg.TestScenario.Offline,
"Server": lg.TestScenario.Server,
"MultiStream": lg.TestScenario.MultiStream,
}


def qlv4_save():
    # ---------------------------------------------------------
    # Settings for the CI test (CLI args are overridden below)
    # ---------------------------------------------------------
args = get_args()
set_optimization(args)
random_seed()

sut = None
args.backend = "pytorch"
args.max_examples = 1
args.recalibrate = True
args.use_mcp = True
args.accuracy = True
args.torch_optim = "none"
args.model_script_path = (
"./quantization/model_script/Qlevel4_RGDA0-W8A8KV8-PTQ.yaml"
)

from pytorch_SUT import get_pytorch_sut

# ---------------------------------------------------------
# get model
# ---------------------------------------------------------
sut = get_pytorch_sut(args)
sut.model = get_quant_model(
sut,
args.model_source,
args.model_script_path,
args.n_calib,
False,
output_path=args.output_path,
)

    if args.model_source == "mlperf_submission":
        model = sut.model.model
    else:
        model = sut.model

    torch.save(model.state_dict(), os.path.join(args.output_path, "qlv4.bin"))
    print("QLV4 model state dict saved successfully")


if __name__ == "__main__":
qlv4_save()
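For reference, a minimal sketch of reloading the artifact this script writes, using only standard PyTorch calls; the receiving module is the quantized model built by get_quant_model above and is left commented out:

import torch

state_dict = torch.load("./quantization/output/qlv4.bin", map_location="cpu")
print(f"loaded {len(state_dict)} tensors")
# The target must be the same quantized module that produced the file:
# model.load_state_dict(state_dict)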
12 changes: 0 additions & 12 deletions language/bert/quantization/get_quant_model.py
@@ -124,20 +124,8 @@ def get_quant_model(sut, model_source, model_script_path, n_calib, recalibrate,
model,
qformat_path=qformat_path,
qparam_path=qparam_path,
weight_calib_method=model_script["weight_calib_method"],
weight_granularity=model_script["weight_granularity"],
weight_dtype=model_script["weight_dtype"],
weight_nbits=model_script["weight_nbits"],
act_calib_method=model_script["act_calib_method"],
act_granularity=model_script["act_granularity"],
act_dtype=model_script["act_dtype"],
act_nbits=model_script["act_nbits"],
kv_dtype = model_script["kv_dtype"] if "kv_dtype" in model_script else 'bf16',
qlevel=model_script["qlevel"],
act_zp_equalizing=(model_script["act_zp_equalizing"] if model_script["act_zp_equalizing"] else 'disabled'),
target_machine=model_script["target_machine"],
dataloader=None,
disable_inout=(True,True),
)

if model_source == 'mlperf_submission' or model_source == 'compact_causal_mask':
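Aside on the hunk above: the conversion call now keeps only qformat_path and qparam_path, presumably because the per-tensor quantization settings travel in those files. The removed kv_dtype line also spelled out a membership check to default to bf16; for reference, the idiomatic dict.get equivalent:

# Example script dict without a kv_dtype key (hypothetical values):
model_script = {"weight_dtype": "int8"}
kv_dtype = model_script.get("kv_dtype", "bf16")  # -> "bf16"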
62 changes: 62 additions & 0 deletions language/gpt-j/qlv4_save.py
@@ -0,0 +1,62 @@
import argparse

import torch
from transformers import AutoConfig

import model_compressor
import quantization

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", default="./model/", help="path to the pretrained model directory")
    parser.add_argument("--model_config", default="./ci_test_file/config.json", help="path to the model config JSON")
    parser.add_argument("--model_script_path", default="./quantization/model_script/Qlevel4_RGDA0-W8A8KV8-PTQ-SMQ-rope_lm-headint8.yaml", help="path to the quantization model script (YAML)")
    parser.add_argument("--model_source", type=str, default="mlperf_submission", help="the type of GPTJForCausalLM to use")
    parser.add_argument("--qformat_path", type=str, default="./quantization/output/qformat_Qlevel4_RGDA0-W8A8KV8-PTQ-SMQ-mlperf_submission.yaml", help="path to the qformat file")
    parser.add_argument("--qparam_path", type=str, default="./quantization/output/qparam_Qlevel4_RGDA0-W8A8KV8-PTQ-SMQ-mlperf_submission.npy", help="path to the qparam file")
    parser.add_argument("--qlv4_prefill_out_path", type=str, default="./quantization/model_script/prefill.bin", help="output path for the quantized prefill state dict")
    parser.add_argument("--qlv4_decode_out_path", type=str, default="./quantization/model_script/decode.bin", help="output path for the quantized decode state dict")
    args = parser.parse_args()
    return args


def save_qlv4_model():
    args = get_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Select the GPTJForCausalLM implementation that matches model_source
if args.model_source == "furiosa_llm_rope_rngd_gelu":
from furiosa_llm_models.gptj.symbolic.huggingface_rope_rngd_gelu import GPTJForCausalLM
elif args.model_source == "mlperf_submission":
from furiosa_llm_models.gptj.symbolic.mlperf_submission import GPTJForCausalLM
else:
        raise ValueError(f"unsupported model_source: {args.model_source}")
config = AutoConfig.from_pretrained(args.model_config)
model = GPTJForCausalLM.from_pretrained(args.model_path, config=config).to(device)

    model_generator = quantization.get_quant_model(
        model=model,
        calib_dataset_path=None,
        model_script_path=args.model_script_path,
        calib_without_padding=False,
        recalibrate=False,
        qformat_path=args.qformat_path,
        qparam_path=args.qparam_path,
    )

if args.model_source == "furiosa_llm_rope_rngd_gelu":
torch.save(model_generator.prefill_model.state_dict(), args.qlv4_prefill_out_path)
torch.save(model_generator.decode_model.state_dict(), args.qlv4_decode_out_path)
elif args.model_source == "mlperf_submission":
torch.save(model_generator.prefill.state_dict(), args.qlv4_prefill_out_path)
torch.save(model_generator.decode.state_dict(), args.qlv4_decode_out_path)

    print("QLV4 prefill/decode state dicts saved successfully")



if __name__ == "__main__":
save_qlv4_model()
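For reference, a minimal sketch of reloading the two state dicts this script writes at its default paths; the receiving modules are the generator's prefill/decode graphs and are left commented out:

import torch

prefill_sd = torch.load("./quantization/model_script/prefill.bin", map_location="cpu")
decode_sd = torch.load("./quantization/model_script/decode.bin", map_location="cpu")
print(f"prefill: {len(prefill_sd)} tensors, decode: {len(decode_sd)} tensors")
# model_generator.prefill.load_state_dict(prefill_sd)
# model_generator.decode.load_state_dict(decode_sd)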
38 changes: 1 addition & 37 deletions language/gpt-j/quantization/get_quant_model.py
@@ -84,7 +84,7 @@ class dotdict(dict):



def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_padding, recalibrate, qformat_path = None, qparam_path = None, immigrate_qparams = False,):
def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_padding, recalibrate, qformat_path = None, qparam_path = None, immigrate_qparams = False):
# Load model script and calibration dataloader (refer to inference-compression/language/gpt-j/README.md for how to download the evaluation and calibration datasets)
model_script = load_model_script(model_script_path)

@@ -192,20 +192,8 @@ def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_
decode_model,
qformat_path = qformat_path,
qparam_path = qparam_path,
weight_calib_method=model_script["weight_calib_method"],
weight_granularity=model_script["weight_granularity"],
weight_dtype=model_script["weight_dtype"],
weight_nbits=model_script["weight_nbits"],
act_calib_method=model_script["act_calib_method"],
act_granularity=model_script["act_granularity"],
act_dtype=model_script["act_dtype"],
act_nbits=model_script["act_nbits"],
qlevel=model_script["qlevel"],
target_machine=model_script["target_machine"],
act_zp_equalizing=(model_script["act_zp_equalizing"] if model_script["act_zp_equalizing"] else 'disabled'),
dataloader=None,
disable_inout=(True, False),
kv_dtype = model_script["kv_dtype"] if "kv_dtype" in model_script else 'bf16',
delete_org_weight=True,
)
generator = FURIOSA_GENERATOR_DICT[model_type]
@@ -230,20 +230,8 @@ def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_
traced_models["prefill"],
qformat_path = qformat_path,
qparam_path = qparam_path,
weight_calib_method=model_script["weight_calib_method"],
weight_granularity=model_script["weight_granularity"],
weight_dtype=model_script["weight_dtype"],
weight_nbits=model_script["weight_nbits"],
act_calib_method=model_script["act_calib_method"],
act_granularity=model_script["act_granularity"],
act_dtype=model_script["act_dtype"],
act_nbits=model_script["act_nbits"],
qlevel=model_script["qlevel"],
target_machine=model_script["target_machine"],
act_zp_equalizing=(model_script["act_zp_equalizing"] if model_script["act_zp_equalizing"] else 'disabled'),
dataloader=None,
disable_inout=(True, False),
kv_dtype = model_script["kv_dtype"] if "kv_dtype" in model_script else 'bf16',
# decode_phase = True,
delete_org_weight=True,
immigrate_qparams = immigrate_qparams,
@@ -253,20 +253,8 @@ def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_
traced_models["decode"],
qformat_path = qformat_path,
qparam_path = qparam_path,
weight_calib_method=model_script["weight_calib_method"],
weight_granularity=model_script["weight_granularity"],
weight_dtype=model_script["weight_dtype"],
weight_nbits=model_script["weight_nbits"],
act_calib_method=model_script["act_calib_method"],
act_granularity=model_script["act_granularity"],
act_dtype=model_script["act_dtype"],
act_nbits=model_script["act_nbits"],
qlevel=model_script["qlevel"],
target_machine=model_script["target_machine"],
act_zp_equalizing=(model_script["act_zp_equalizing"] if model_script["act_zp_equalizing"] else 'disabled'),
dataloader=None,
disable_inout=(True, False),
kv_dtype = model_script["kv_dtype"] if "kv_dtype" in model_script else 'bf16',
decode_phase = True,
delete_org_weight=True,
quantized_prefill_model=prefill_model,
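One structural point in the hunks above: the prefill graph is quantized first, and the decode graph is then quantized with decode_phase=True and quantized_prefill_model=prefill_model, so the two graphs stay consistent. A pseudocode sketch of that ordering ("quantize" is a placeholder for the model_compressor call elided in this diff):

def quantize(graph, decode_phase=False, quantized_prefill_model=None, **kwargs):
    ...  # placeholder: the real logic lives in model_compressor

prefill_model = quantize("prefill-graph")
decode_model = quantize(
    "decode-graph",
    decode_phase=True,
    quantized_prefill_model=prefill_model,  # decode is quantized against prefill
)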
