Transition to default int4 compression configs from optimum-intel (#689)
**Changes**

- For `4BIT_DEFAULT`, the compression config is now always imported from optimum-intel
- Remove all default configs from openvino.genai and rely only on the default configs in optimum-intel
- Adopt the dataset preparation logic from optimum-intel (a rough sketch follows)
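
The adopted preparation path mirrors the new `get_nncf_dataset()` helper in the diff below: calibration samples come from `optimum.gptq.data` and are wrapped into an `nncf.Dataset`. A minimal sketch, assuming a Hugging Face tokenizer is already available (`"gpt2"` is only a placeholder model id, and the real helper additionally attaches a transform function that builds the OpenVINO model inputs):

```python
# Minimal sketch of the adopted dataset preparation flow (condensed from the new
# get_nncf_dataset() helper in this commit; "gpt2" is just a placeholder model id).
import nncf
from optimum.gptq.data import get_dataset, prepare_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer

# The calibration dataset is now selected by name ("wikitext2", "c4" or "c4-new")
# instead of the old "path,name,split,item_name" string.
samples = get_dataset("wikitext2", tokenizer, seqlen=32, nsamples=128)
samples = prepare_dataset(samples)

# The real helper also passes a transform function that maps each sample onto the
# OpenVINO model inputs (input_ids, attention_mask, position_ids, beam_idx).
calibration_dataset = nncf.Dataset(samples)
```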

Breaking changes:
- The old compression dataset specification is deprecated. For example, `"wikitext,wikitext-2-v1,train[:1000],text"` will no longer work; pass the dataset name instead, e.g. `--dataset wikitext2`.
- Compression with data-aware methods will produce different, potentially worse, results.
- Default int4 configs will no longer be matched by folder names. To match a default config, one of the following must hold (see the resolution sketch after this list):
  - the model is exported with `python convert.py -m <model_id>`
  - the model is exported with `python convert.py -m <model_path>`, where `model_path` contains a `config.json` file with a `_name_or_path` field containing a model id
  - the model is exported with `python convert.py -m <model_path>`, where `model_path` contains a `config.json` file with a `_name_or_path` field containing another `model_path_2`, such that the condition from the previous bullet holds for it
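
For illustration, a rough sketch of how the default int4 config is now resolved (condensed and simplified from `compress_ov_model_weights_helper` in the helpers.py diff below; `resolve_default_4bit_config` is a hypothetical name, and error handling plus the older optimum-intel code path are omitted):

```python
# Simplified sketch of the default int4 config resolution; see the helpers.py
# diff below for the actual logic committed here.
import json
from pathlib import Path

from optimum.intel.openvino.configuration import _DEFAULT_4BIT_CONFIG, _check_default_4bit_configs


def resolve_default_4bit_config(name_or_path: str) -> dict:
    # 1) Match the model id directly against optimum-intel's default 4-bit configs.
    config = _check_default_4bit_configs(name_or_path)
    if config is None:
        # 2) For a model exported by local path, follow the `_name_or_path`
        #    recorded in config.json and try to match that instead.
        config_json = Path(name_or_path) / "config.json"
        if config_json.exists():
            with config_json.open("r") as f:
                inner = json.load(f).get("_name_or_path")
            if inner is not None:
                config = _check_default_4bit_configs(inner)
    # 3) Otherwise fall back to optimum-intel's generic default int4 config.
    return config or _DEFAULT_4BIT_CONFIG
```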

**Related tickets**
147470
nikita-savelyevv authored Aug 26, 2024
1 parent 779c298 commit d6bd822
Showing 3 changed files with 70 additions and 128 deletions.
5 changes: 1 addition & 4 deletions llm_bench/python/convert.py
@@ -1446,10 +1446,7 @@ def main():
compression_group.add_argument(
"--dataset",
help=(
"Dataset parameters for data-aware compression in format path,name,split,item_name "
"(for example \"wikitext,wikitext-2-v1,train[:1000],text\") "
"path,name,split - parameters for load_dataset from datasets "
"and item_name is field name in dataset with text."
"Dataset name for data-aware compression. Must be one of ['wikitext2', 'c4', 'c4-new']."
),
default=None,
type=str,
150 changes: 69 additions & 81 deletions llm_bench/python/llm_bench_utils/conversion_utils/helpers.py
@@ -1,20 +1,22 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import copy
import json
from enum import Enum
from functools import partial
import logging as log
from pathlib import Path
from datasets import load_dataset
from typing import Optional, List, Dict

import torch
import numpy as np
from nncf import compress_weights
from nncf import Dataset
from openvino import save_model
import nncf
from ..nncf_utils import COMPRESSION_OPTIONS, INT4_MODEL_CONFIGURATION
from optimum.intel.openvino.configuration import _check_default_4bit_configs
from ..nncf_utils import COMPRESSION_OPTIONS
from optimum.gptq.data import get_dataset, prepare_dataset
from optimum.intel.openvino.configuration import _check_default_4bit_configs, OVQuantizationMethod, _DEFAULT_4BIT_CONFIG
import warnings


@@ -94,27 +96,31 @@ def save_tokenizer(tokenizer, out_dir):
log.error(f'tokenizer loading failed with {e}')


def transform_fn(item, item_name, input_shapes, tokenizer, config, max_tokens=127):
tokenized_text = tokenizer(item[item_name], return_tensors="np")
input_ids = tokenized_text["input_ids"][:max_tokens]
attention_mask = tokenized_text["attention_mask"][:max_tokens]

inputs = {}
inputs["input_ids"] = input_ids
def transform_fn(
config,
input_shapes: Dict[str, List],
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
**kwargs
):
inputs = {"input_ids": np.array(input_ids)}

if "attention_mask" in input_shapes:
inputs["attention_mask"] = tokenized_text["attention_mask"]
inputs["attention_mask"] = attention_mask

if "position_ids" in input_shapes:
position_ids = np.cumsum(attention_mask, axis=1) - 1
position_ids[attention_mask == 0] = 1
if position_ids is None:
position_ids = np.cumsum(attention_mask, axis=1) - 1
position_ids[attention_mask == 0] = 1
else:
position_ids = np.array(position_ids)
inputs["position_ids"] = position_ids

batch_size = input_ids.shape[0]
if config.model_type == "bloom":
batch_size *= config.num_attention_heads

if "beam_idx" in input_shapes:
batch_size = input_ids.shape[0]
if config.model_type == "bloom":
batch_size *= config.num_attention_heads
inputs["beam_idx"] = np.arange(batch_size, dtype=int)

for name, shape in input_shapes.items():
@@ -136,86 +142,68 @@ def get_ov_input_shapes(model, batch_size=1):
return inputs


def get_data_aware_args(ov_model, tokenizer, config, compression_args, args):
def get_nncf_dataset(ov_model, tokenizer, config, dataset_name, subset_size):
"""initializes dict with data-aware compression parameters if defined dataset and tokenizer
Args:
ov_model : OpenVINO model for compression
tokenizer : tokenizer for ov_model
config : ov_model configuration
compression_args: compression arguments from model compression configuration
args : CLI args
dataset_name: name of the dataset to load; must be one of ['wikitext2', 'c4', 'c4-new']
subset_size: the number of samples the dataset should contain
Returns:
res: dict with data-aware compression parameters
nncf_dataset: NNCF dataset
"""
res = {}
if tokenizer is None:
return res
dataset_params = None
if 'dataset' in compression_args:
dataset_args = compression_args['dataset']
dataset_params = dataset_args['name']
if 'sensitivity_metric' in dataset_args:
res['mode'] = dataset_args['sensitivity_metric']
if 'awq' in dataset_args:
res['awq'] = dataset_args['awq']
if 'scale_estimation' in dataset_args:
res['scale_estimation'] = dataset_args['scale_estimation']
elif args.dataset is not None:
dataset_params = args.dataset
if args.awq:
res['awq'] = args.awq
if args.scale_estimation:
res['scale_estimation'] = args.scale_estimation

if dataset_params is not None:
# for example "wikitext,wikitext-2-v1,train[:1000],text"
path, name, split, item_name = dataset_params.split(',')
dataset = load_dataset(path, name, split=split)

if path == 'wikitext':
# filter short sentences
dataset = dataset.filter(lambda example: len(example["text"]) > 128)
input_shapes = get_ov_input_shapes(ov_model)
data_transform_func = partial(transform_fn, item_name=item_name, tokenizer=tokenizer, input_shapes=input_shapes, config=config)
nncf_dataset = Dataset(dataset, data_transform_func)
res['dataset'] = nncf_dataset
return res
subset_size = subset_size or 128
dataset = get_dataset(dataset_name, tokenizer, seqlen=32, nsamples=subset_size)
dataset = prepare_dataset(dataset)
input_shapes = get_ov_input_shapes(ov_model)
nncf_dataset = Dataset(dataset, lambda x: transform_fn(config=config, input_shapes=input_shapes, **x))
return nncf_dataset


def compress_ov_model_weights_helper(ov_model, tok, config, out_path, compress_weights_format="INT8", fp16=False, args={}, model_name="openvino_model"):
compression_args = None
if "INT8" in compress_weights_format and "INT8_ASYM" in COMPRESSION_OPTIONS:
warnings.warn("Usage INT8 mode is deprecated and will be removed soon. Please use INT8_ASYM instead", DeprecationWarning)
if "4BIT_DEFAULT" in compress_weights_format:
try:
# TODO: remove this path when support of an older version optimum-intel is deprecated
compression_args = _check_default_4bit_configs(config)
except TypeError:
compression_args = _check_default_4bit_configs(config.name_or_path)
if compression_args:
sym = compression_args.pop("sym", False)
compression_args.pop("bits", 4)
compression_args["mode"] = nncf.CompressWeightsMode.INT4_SYM if sym else nncf.CompressWeightsMode.INT4_ASYM
compression_args = _check_default_4bit_configs(config.name_or_path)
if compression_args is None:
model_id = out_path.parents[3].name
if model_id in INT4_MODEL_CONFIGURATION:
compression_args = INT4_MODEL_CONFIGURATION[model_id]
else:
compression_args = COMPRESSION_OPTIONS["INT4_ASYM"]

if compression_args is None:
compression_args = COMPRESSION_OPTIONS[compress_weights_format]
if args.ratio is not None:
compression_args["ratio"] = args.ratio
if args.group_size is not None:
compression_args["group_size"] = args.group_size
if args.all_layers:
compression_args["all_layers"] = True
config_path = Path(config.name_or_path) / "config.json"
if config_path.exists():
with config_path.open("r") as f:
json_config = json.load(f)
name_or_path = json_config.get("_name_or_path", None)
if name_or_path is not None:
# Do additional check in case the input model is a full precision IR exported from PT model by path
compression_args = _check_default_4bit_configs(name_or_path)
compression_args = compression_args or _DEFAULT_4BIT_CONFIG
compression_args = copy.deepcopy(compression_args)
compression_args.pop("bits")

sym = compression_args.pop("sym", False)
compression_args["mode"] = nncf.CompressWeightsMode.INT4_SYM if sym else nncf.CompressWeightsMode.INT4_ASYM
if compression_args.pop("quant_method", None) == OVQuantizationMethod.AWQ:
compression_args["awq"] = True
if "num_samples" in compression_args:
compression_args["subset_size"] = compression_args.pop("num_samples")
if not compression_args.get("all_layers", None):
compression_args.pop("all_layers", None)
else:
compression_args = copy.deepcopy(COMPRESSION_OPTIONS[compress_weights_format])
for arg_name in ["ratio", "group_size", "all_layers", "dataset", "awq", "scale_estimation"]:
arg_value = getattr(args, arg_name, None)
if arg_value:
compression_args[arg_name] = arg_value

log.info("Compression options:")
log.info(compression_args)
compression_args.update(get_data_aware_args(ov_model, tok, config, compression_args, args))

dataset_name = compression_args.pop("dataset", None)
if dataset_name is not None and tok is not None:
nncf_dataset = get_nncf_dataset(ov_model, tok, config, dataset_name, compression_args.get("subset_size", None))
compression_args["dataset"] = nncf_dataset

compressed_ov_model = compress_weights(ov_model, **compression_args)
save_ov_model_helper(compressed_ov_model, out_path, model_name, fp16=fp16, tok=tok, config=config)

43 changes: 0 additions & 43 deletions llm_bench/python/llm_bench_utils/nncf_utils.py
@@ -35,46 +35,3 @@

def get_compressed_path(output_dir: str, base_precision, option: str):
return Path(output_dir) / "pytorch/dldt/compressed_weights" / f"OV_{base_precision}-{option}"


INT4_MODEL_CONFIGURATION = {
"dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
"opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
"red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
"zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}},
"llama-2-7b-hf": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
"llama-2-7b-chat-hf": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
"llama-2-13b-chat-hf": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
"stablelm-3b-4e1t": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}},
"stablelm-epoch-3b-preview": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}},
"stable-zephyr-3b-dpo": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}},
"stable-code-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
"rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
"chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72},
"qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
"open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 1.0, "all_layers": True},
"open-llama-3b-v2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 1.0,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}},
"falcon-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
"orca-mini-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
"bloomz-560m": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}},
"mixtral-8x7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
"baichuan2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}},
"mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9},
"llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7},
"opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7},
"red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0},
"stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"longchat-b7": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
"starcoder2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
"tiny-llama-1.1b-chat": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"phi-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
}
