diff --git a/llm_bench/python/utils/conversion_utils/helpers.py b/llm_bench/python/utils/conversion_utils/helpers.py index 2c7508b6d4..09693a3c3f 100644 --- a/llm_bench/python/utils/conversion_utils/helpers.py +++ b/llm_bench/python/utils/conversion_utils/helpers.py @@ -13,8 +13,8 @@ from nncf import Dataset from openvino import save_model import nncf -from ..nncf_utils import COMPRESSION_OPTIONS, INT4_MODEL_CONFIGURATION -from optimum.intel.openvino.configuration import _check_default_4bit_configs +from ..nncf_utils import COMPRESSION_OPTIONS +from optimum.intel.openvino.configuration import get_default_int4_config, _DEFAULT_4BIT_CONFIG, OVQuantizationMethod import warnings @@ -157,7 +157,7 @@ def get_data_aware_args(ov_model, tokenizer, config, compression_args, args): dataset_args = compression_args['dataset'] dataset_params = dataset_args['name'] if 'sensitivity_metric' in dataset_args: - res['mode'] = dataset_args['sensitivity_metric'] + res['sensitivity_metric'] = dataset_args['sensitivity_metric'] if 'awq' in dataset_args: res['awq'] = dataset_args['awq'] if 'scale_estimation' in dataset_args: @@ -172,7 +172,7 @@ def get_data_aware_args(ov_model, tokenizer, config, compression_args, args): if dataset_params is not None: # for example "wikitext,wikitext-2-v1,train[:1000],text" path, name, split, item_name = dataset_params.split(',') - dataset = load_dataset(path, name, split=split) + dataset = load_dataset(path, name, split=split, streaming="allenai/c4" in path) if path == 'wikitext': # filter short sentences @@ -189,17 +189,37 @@ def compress_ov_model_weights_helper(ov_model, tok, config, out_path, compress_w if "INT8" in compress_weights_format and "INT8_ASYM" in COMPRESSION_OPTIONS: warnings.warn("Usage INT8 mode is deprecated and will be removed soon. Please use INT8_ASYM instead", DeprecationWarning) if "4BIT_DEFAULT" in compress_weights_format: - compression_args = _check_default_4bit_configs(config) - if compression_args: - sym = compression_args.pop("sym", False) - compression_args.pop("bits", 4) - compression_args["mode"] = nncf.CompressWeightsMode.INT4_SYM if sym else nncf.CompressWeightsMode.INT4_ASYM - if compression_args is None: - model_id = out_path.parents[3].name - if model_id in INT4_MODEL_CONFIGURATION: - compression_args = INT4_MODEL_CONFIGURATION[model_id] + compression_args = get_default_int4_config(config.name_or_path) + compression_args.pop("bits") + + sym = compression_args.pop("sym", _DEFAULT_4BIT_CONFIG["sym"]) + compression_args["mode"] = nncf.CompressWeightsMode.INT4_SYM if sym else nncf.CompressWeightsMode.INT4_ASYM + + quant_method = compression_args.pop("quant_method", None) + scale_estimation = compression_args.pop("scale_estimation", False) + sensitivity_metric = compression_args.pop("sensitivity_metric", None) + num_samples = compression_args.pop("num_samples", None) + if num_samples: + compression_args["subset_size"] = num_samples + dataset = compression_args.pop("dataset", None) + if dataset: + if dataset == "wikitext2": + dataset_name = "wikitext,wikitext-2-v1,train[:1000],text" + elif dataset == "c4" or dataset == "c4-new": + dataset_name = "allenai/c4,en,train,text" else: - compression_args = COMPRESSION_OPTIONS["INT4_ASYM"] + raise ValueError(f"Unrecognized dataset: {dataset}") + + dataset_args = {"name": dataset_name} + if quant_method == OVQuantizationMethod.AWQ: + dataset_args["awq"] = True + elif quant_method is not None: + raise ValueError(f"Unrecognised quant_method: {quant_method}") + if scale_estimation: + dataset_args["scale_estimation"] = True + if sensitivity_metric: + dataset_args["sensitivity_metric"] = sensitivity_metric + compression_args["dataset"] = dataset_args if compression_args is None: compression_args = COMPRESSION_OPTIONS[compress_weights_format] diff --git a/llm_bench/python/utils/nncf_utils.py b/llm_bench/python/utils/nncf_utils.py index 01d0dd95b3..b65e90a3a9 100644 --- a/llm_bench/python/utils/nncf_utils.py +++ b/llm_bench/python/utils/nncf_utils.py @@ -35,45 +35,3 @@ def get_compressed_path(output_dir: str, base_precision, option: str): return Path(output_dir) / "pytorch/dldt/compressed_weights" / f"OV_{base_precision}-{option}" - - -INT4_MODEL_CONFIGURATION = { - "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, - "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64}, - "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8}, - "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128}, - "zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8, - "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, - "llama-2-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6}, - "llama-2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8}, - "llama-2-13b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8}, - "stablelm-3b-4e1t": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8, - "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, - "stablelm-epoch-3b-preview": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8, - "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, - "stable-zephyr-3b-dpo": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, - "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, - "stable-code-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8}, - "rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8}, - "chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72}, - "qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6}, - "open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 1.0, "all_layers": True}, - "falcon-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True}, - "orca-mini-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True, - "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": False}}, - "bloomz-560m": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8, - "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, - "mixtral-8x7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8}, - "baichuan2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8, - "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, - "mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9}, - "llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, - "opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, - "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, - "vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0}, - "stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, - "longchat-b7": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, - "starcoder2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, - "tiny-llama-1.1b-chat": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, - "phi-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, -}