diff --git a/language/bert/qlv4_save.py b/language/bert/qlv4_save.py
new file mode 100644
index 000000000..0ecec4da7
--- /dev/null
+++ b/language/bert/qlv4_save.py
@@ -0,0 +1,187 @@
+# coding=utf-8
+# Copyright 2021 Arm Limited and affiliates.
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import sys
+import torch
+
+import mlperf_loadgen as lg
+
+sys.path.insert(0, os.getcwd())
+sys.path.insert(0, os.path.join(os.getcwd(), "..", "..", "lon"))
+import model_compressor
+
+from quantization import get_quant_model
+from utils import random_seed, set_optimization
+
+
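+# This CI helper builds the PyTorch BERT SUT, quantizes it to QLV4 with
+# model_compressor via get_quant_model(), and saves the quantized state
+# dict for later reuse.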
"experimental_huggingface_unsplit_packed", + ], + help="choose model source", + ) + parser.add_argument( + "--output_path", + default='./quantization/output', + help="", + ) + + args = parser.parse_args() + return args + + +scenario_map = { + "SingleStream": lg.TestScenario.SingleStream, + "Offline": lg.TestScenario.Offline, + "Server": lg.TestScenario.Server, + "MultiStream": lg.TestScenario.MultiStream, +} + + +def qlv4_save(): + # --------------------------------------------------------- + # Setting for ci test + # --------------------------------------------------------- + args = get_args() + set_optimization(args) + random_seed() + + sut = None + args.backend = "pytorch" + args.max_examples = 1 + args.recalibrate = True + args.use_mcp = True + args.accuracy = True + args.torch_optim = "none" + args.model_script_path = ( + "./quantization/model_script/Qlevel4_RGDA0-W8A8KV8-PTQ.yaml" + ) + + from pytorch_SUT import get_pytorch_sut + + # --------------------------------------------------------- + # get model + # --------------------------------------------------------- + sut = get_pytorch_sut(args) + sut.model = get_quant_model( + sut, + args.model_source, + args.model_script_path, + args.n_calib, + False, + output_path=args.output_path, + ) + + if args.model_source =="mlperf_submission": + model = sut.model.model + else: + model= sut.model + + torch.save(model.state_dict(), args.output_path + '/qlv4.bin') + print("qlv4 model is saved well") + + +if __name__ == "__main__": + qlv4_save() diff --git a/language/bert/quantization/get_quant_model.py b/language/bert/quantization/get_quant_model.py index d815cc4c9..d8bc7eeaf 100644 --- a/language/bert/quantization/get_quant_model.py +++ b/language/bert/quantization/get_quant_model.py @@ -124,20 +124,8 @@ def get_quant_model(sut, model_source, model_script_path, n_calib, recalibrate, model, qformat_path=qformat_path, qparam_path=qparam_path, - weight_calib_method=model_script["weight_calib_method"], - weight_granularity=model_script["weight_granularity"], - weight_dtype=model_script["weight_dtype"], - weight_nbits=model_script["weight_nbits"], - act_calib_method=model_script["act_calib_method"], - act_granularity=model_script["act_granularity"], - act_dtype=model_script["act_dtype"], - act_nbits=model_script["act_nbits"], - kv_dtype = model_script["kv_dtype"] if "kv_dtype" in model_script else 'bf16', qlevel=model_script["qlevel"], - act_zp_equalizing=(model_script["act_zp_equalizing"] if model_script["act_zp_equalizing"] else 'disabled'), target_machine=model_script["target_machine"], - dataloader=None, - disable_inout=(True,True), ) if model_source == 'mlperf_submission' or model_source == 'compact_causal_mask': diff --git a/language/gpt-j/qlv4_save.py b/language/gpt-j/qlv4_save.py new file mode 100644 index 000000000..eb00e776d --- /dev/null +++ b/language/gpt-j/qlv4_save.py @@ -0,0 +1,62 @@ +import yaml +from transformers import AutoConfig +import torch +import json +import quantization +import model_compressor +import joblib +import argparse + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", default="./model/", help="") + parser.add_argument("--model_config", default="./ci_test_file/config.json", help="") + parser.add_argument("--model_script_path", default="./quantization/model_script/Qlevel4_RGDA0-W8A8KV8-PTQ-SMQ-rope_lm-headint8.yaml", help="") + parser.add_argument("--model_source", type = str, default = "mlperf_submission", help="the type of GPTJForCausalLM to use") + 
+
+
+if __name__ == "__main__":
+    qlv4_save()
diff --git a/language/bert/quantization/get_quant_model.py b/language/bert/quantization/get_quant_model.py
index d815cc4c9..d8bc7eeaf 100644
--- a/language/bert/quantization/get_quant_model.py
+++ b/language/bert/quantization/get_quant_model.py
@@ -124,20 +124,8 @@ def get_quant_model(sut, model_source, model_script_path, n_calib, recalibrate,
         model,
         qformat_path=qformat_path,
         qparam_path=qparam_path,
-        weight_calib_method=model_script["weight_calib_method"],
-        weight_granularity=model_script["weight_granularity"],
-        weight_dtype=model_script["weight_dtype"],
-        weight_nbits=model_script["weight_nbits"],
-        act_calib_method=model_script["act_calib_method"],
-        act_granularity=model_script["act_granularity"],
-        act_dtype=model_script["act_dtype"],
-        act_nbits=model_script["act_nbits"],
-        kv_dtype = model_script["kv_dtype"] if "kv_dtype" in model_script else 'bf16',
         qlevel=model_script["qlevel"],
-        act_zp_equalizing=(model_script["act_zp_equalizing"] if model_script["act_zp_equalizing"] else 'disabled'),
         target_machine=model_script["target_machine"],
-        dataloader=None,
-        disable_inout=(True,True),
     )
 
     if model_source == 'mlperf_submission' or model_source == 'compact_causal_mask':
diff --git a/language/gpt-j/qlv4_save.py b/language/gpt-j/qlv4_save.py
new file mode 100644
index 000000000..eb00e776d
--- /dev/null
+++ b/language/gpt-j/qlv4_save.py
@@ -0,0 +1,62 @@
+from transformers import AutoConfig
+import torch
+import quantization
+import model_compressor
+import argparse
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", default="./model/", help="path to the GPT-J checkpoint directory")
+    parser.add_argument("--model_config", default="./ci_test_file/config.json", help="path to the model config JSON")
+    parser.add_argument("--model_script_path", default="./quantization/model_script/Qlevel4_RGDA0-W8A8KV8-PTQ-SMQ-rope_lm-headint8.yaml", help="path to the quantization model script (YAML)")
+    parser.add_argument("--model_source", type=str, default="mlperf_submission", help="the type of GPTJForCausalLM to use")
+    parser.add_argument("--qformat_path", type=str, default="./quantization/output/qformat_Qlevel4_RGDA0-W8A8KV8-PTQ-SMQ-mlperf_submission.yaml", help="path to the qformat file")
+    parser.add_argument("--qparam_path", type=str, default="./quantization/output/qparam_Qlevel4_RGDA0-W8A8KV8-PTQ-SMQ-mlperf_submission.npy", help="path to the qparam file")
+    parser.add_argument("--qlv4_prefill_out_path", type=str, default="./quantization/model_script/prefill.bin", help="where to save the quantized prefill state dict")
+    parser.add_argument("--qlv4_decode_out_path", type=str, default="./quantization/model_script/decode.bin", help="where to save the quantized decode state dict")
+    args = parser.parse_args()
+    return args
+
+
+# Load the model, quantize it with the model script, and save the QLV4
+# prefill/decode state dicts.
+def save_qlv4_model():
+    args = get_args()
+    torch_device_type = "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(torch_device_type)
+
+    # Select the GPTJForCausalLM implementation for the chosen model source.
+    if args.model_source == "furiosa_llm_rope_rngd_gelu":
+        from furiosa_llm_models.gptj.symbolic.huggingface_rope_rngd_gelu import GPTJForCausalLM
+    elif args.model_source == "mlperf_submission":
+        from furiosa_llm_models.gptj.symbolic.mlperf_submission import GPTJForCausalLM
+    else:
+        raise ValueError(f"unsupported model_source: {args.model_source}")
+
+    config = AutoConfig.from_pretrained(args.model_config)
+    model = GPTJForCausalLM.from_pretrained(args.model_path, config=config).to(device)
+
+    model_generator = quantization.get_quant_model(
+        model=model,
+        calib_dataset_path=None,
+        model_script_path=args.model_script_path,
+        calib_without_padding=False,
+        recalibrate=False,
+        qformat_path=args.qformat_path,
+        qparam_path=args.qparam_path,
+    )
+
+    # The two model sources expose the prefill/decode graphs under
+    # different attribute names.
+    if args.model_source == "furiosa_llm_rope_rngd_gelu":
+        torch.save(model_generator.prefill_model.state_dict(), args.qlv4_prefill_out_path)
+        torch.save(model_generator.decode_model.state_dict(), args.qlv4_decode_out_path)
+    elif args.model_source == "mlperf_submission":
+        torch.save(model_generator.prefill.state_dict(), args.qlv4_prefill_out_path)
+        torch.save(model_generator.decode.state_dict(), args.qlv4_decode_out_path)
+
+    print("QLV4 state dicts saved successfully")
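+
+    # Optional sanity check (illustrative, a minimal sketch): confirm both
+    # checkpoint files were written and are non-empty.
+    import os
+    for path in (args.qlv4_prefill_out_path, args.qlv4_decode_out_path):
+        assert os.path.getsize(path) > 0, f"empty checkpoint: {path}"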
+
+
+if __name__ == "__main__":
+    save_qlv4_model()
diff --git a/language/gpt-j/quantization/get_quant_model.py b/language/gpt-j/quantization/get_quant_model.py
index c0956584c..dc09f18f0 100644
--- a/language/gpt-j/quantization/get_quant_model.py
+++ b/language/gpt-j/quantization/get_quant_model.py
@@ -84,7 +84,7 @@ class dotdict(dict):
 
 
 
-def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_padding, recalibrate, qformat_path = None, qparam_path = None, immigrate_qparams = False,):
+def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_padding, recalibrate, qformat_path = None, qparam_path = None, immigrate_qparams = False):
     # Load model script and calibration dataloader (Refer to inference-compression/language/gpt-j/README.md on how to download evaluation and calibration dataset )
     model_script = load_model_script(model_script_path)
 
@@ -192,20 +192,8 @@ def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_
         decode_model,
         qformat_path = qformat_path,
         qparam_path = qparam_path,
-        weight_calib_method=model_script["weight_calib_method"],
-        weight_granularity=model_script["weight_granularity"],
-        weight_dtype=model_script["weight_dtype"],
-        weight_nbits=model_script["weight_nbits"],
-        act_calib_method=model_script["act_calib_method"],
-        act_granularity=model_script["act_granularity"],
-        act_dtype=model_script["act_dtype"],
-        act_nbits=model_script["act_nbits"],
         qlevel=model_script["qlevel"],
         target_machine=model_script["target_machine"],
-        act_zp_equalizing=(model_script["act_zp_equalizing"] if model_script["act_zp_equalizing"] else 'disabled'),
-        dataloader=None,
-        disable_inout=(True, False),
-        kv_dtype = model_script["kv_dtype"] if "kv_dtype" in model_script else 'bf16',
         delete_org_weight=True,
     )
     generator = FURIOSA_GENERATOR_DICT[model_type]
@@ -230,20 +218,8 @@ def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_
         traced_models["prefill"],
         qformat_path = qformat_path,
         qparam_path = qparam_path,
-        weight_calib_method=model_script["weight_calib_method"],
-        weight_granularity=model_script["weight_granularity"],
-        weight_dtype=model_script["weight_dtype"],
-        weight_nbits=model_script["weight_nbits"],
-        act_calib_method=model_script["act_calib_method"],
-        act_granularity=model_script["act_granularity"],
-        act_dtype=model_script["act_dtype"],
-        act_nbits=model_script["act_nbits"],
         qlevel=model_script["qlevel"],
         target_machine=model_script["target_machine"],
-        act_zp_equalizing=(model_script["act_zp_equalizing"] if model_script["act_zp_equalizing"] else 'disabled'),
-        dataloader=None,
-        disable_inout=(True, False),
-        kv_dtype = model_script["kv_dtype"] if "kv_dtype" in model_script else 'bf16',
         # decode_phase = True,
         delete_org_weight=True,
         immigrate_qparams = immigrate_qparams,
@@ -253,20 +229,8 @@ def get_quant_model(model, calib_dataset_path, model_script_path, calib_without_
         traced_models["decode"],
         qformat_path = qformat_path,
         qparam_path = qparam_path,
-        weight_calib_method=model_script["weight_calib_method"],
-        weight_granularity=model_script["weight_granularity"],
-        weight_dtype=model_script["weight_dtype"],
-        weight_nbits=model_script["weight_nbits"],
-        act_calib_method=model_script["act_calib_method"],
-        act_granularity=model_script["act_granularity"],
-        act_dtype=model_script["act_dtype"],
-        act_nbits=model_script["act_nbits"],
         qlevel=model_script["qlevel"],
         target_machine=model_script["target_machine"],
-        act_zp_equalizing=(model_script["act_zp_equalizing"] if model_script["act_zp_equalizing"] else 'disabled'),
-        dataloader=None,
-        disable_inout=(True, False),
-        kv_dtype = model_script["kv_dtype"] if "kv_dtype" in model_script else 'bf16',
         decode_phase = True,
         delete_org_weight=True,
         quantized_prefill_model=prefill_model,