Skip to content

Commit

Permalink
Enable 4BIT_MAXIMUM compression option (#345)
Browse files Browse the repository at this point in the history
4BIT_MAXIMUM compression option has the following settings:

```
"4BIT_MAXIMUM": {
    "mode": nncf.CompressWeightsMode.INT4_SYM,
    "group_size": 128,
    "ratio": 1,
    "all_layers": True,
}
```
  • Loading branch information
skuros authored Apr 9, 2024
1 parent 053dd88 commit b8d243f
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 7 deletions.
2 changes: 1 addition & 1 deletion llm_bench/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Parameters:
* `--output_dir` - output directory for saving OpenVINO model
* `--precision` - (optional, default FP32), precision for model conversion FP32 or FP16
* `--save_orig` - flag for saving original pytorch model, model will be located in `<output_dir>/pytorch` subdirectory.
* `--compress_weights` - The weight compression option, INT8 - INT8 weights, 4BIT_DEFAULT - for 4-bit weights compression with predefined configuration, INT4_SYM - for INT4 compressed weights with symmetric quantization, INT4_ASYM - for INT4 compressed weights with asymmetric quantization. You can specify multiple backends separated by a space.
* `--compress_weights` - The weight compression option, INT8 - INT8 weights, 4BIT_DEFAULT - for 4-bit compression with predefined configs with performance-accuracy trade-off, 4BIT_MAXIMUM - for 4-bit compression with predefined configs for the best performance, INT4_SYM - for INT4 compressed weights with symmetric quantization, INT4_ASYM - for INT4 compressed weights with asymmetric quantization. You can specify multiple backends separated by a space.
* `--compress_weights_backends` - (optional, default openvino) backends for weights compression, this option has an effect only with `--compress_weights`. You can specify multiple backends separated by a space.
* `--ratio` - Compression ratio between primary and backup precision, e.g. INT4/INT8.
* `--group_size` - Size of the group of weights that share the same quantization parameters
Expand Down
5 changes: 3 additions & 2 deletions llm_bench/python/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -1350,11 +1350,12 @@ def main():
"-c",
"--compress_weights",
type=str,
choices=["INT8", "INT8_ASYM", "INT8_SYM", "4BIT_DEFAULT", "INT4_SYM", "INT4_ASYM"],
choices=["INT8", "INT8_ASYM", "INT8_SYM", "4BIT_DEFAULT", "4BIT_MAXIMUM", "INT4_SYM", "INT4_ASYM"],
nargs="+",
help=(
"The weight compression option, e.g. INT8 - INT8 weights (deprecated, please use INT8_ASYM instead), "
"4BIT_DEFAULT - for 4-bit compression with predefined configs, "
"4BIT_DEFAULT - for 4-bit compression with predefined configs with performance-accuracy trade-off, "
"4BIT_MAXIMUM - for 4-bit compression with predefined configs for the best performance, "
"INT4_* - for INT4 compressed weights."
),
)
Expand Down
3 changes: 2 additions & 1 deletion llm_bench/python/utils/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,8 @@ def get_model_precision(model_name_list):
'OV_FP32-INT8_ASYM', 'OV_FP32-INT8_SYM', 'OV_FP16-INT8_ASYM', 'OV_FP16-INT8_SYM',
'PT_FP32-INT8', 'PT_FP16-INT8', 'PT_FP32-INT8_ASYM', 'PT_FP32-INT8_SYM', 'PT_FP16-INT8_ASYM', 'PT_FP16-INT8_SYM',
'GPTQ_INT4-FP32', 'GPTQ_INT4-FP16', 'INT4',
'OV_FP16-INT4_SYM', 'OV_FP16-INT4_ASYM', 'OV_FP32-INT4_SYM', 'OV_FP32-INT4_ASYM', 'OV_FP32-4BIT_DEFAULT', 'OV_FP16-4BIT_DEFAULT']
'OV_FP16-INT4_SYM', 'OV_FP16-INT4_ASYM', 'OV_FP32-INT4_SYM', 'OV_FP32-INT4_ASYM',
'OV_FP32-4BIT_DEFAULT', 'OV_FP16-4BIT_DEFAULT', 'OV_FP32-4BIT_MAXIMUM', 'OV_FP16-4BIT_MAXIMUM']
model_precision = 'unknown'
# Search from right to left of model path
for i in range(len(model_name_list) - 1, -1, -1):
Expand Down
11 changes: 8 additions & 3 deletions llm_bench/python/utils/nncf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

import nncf


COMPRESSION_OPTIONS = {
"INT8": {"mode": nncf.CompressWeightsMode.INT8 if "INT8_ASYM" not in nncf.CompressWeightsMode.__members__ else nncf.CompressWeightsMode.INT8_ASYM},
"INT8": {
"mode": nncf.CompressWeightsMode.INT8 if "INT8_ASYM" not in nncf.CompressWeightsMode.__members__ else nncf.CompressWeightsMode.INT8_ASYM},
"INT4_SYM": {
"mode": nncf.CompressWeightsMode.INT4_SYM,
"group_size": 128,
Expand All @@ -13,9 +13,14 @@
"mode": nncf.CompressWeightsMode.INT4_ASYM,
"group_size": 128,
},
"4BIT_MAXIMUM": {
"mode": nncf.CompressWeightsMode.INT4_SYM,
"group_size": 128,
"ratio": 1,
"all_layers": True,
},
}


if "INT8_ASYM" in nncf.CompressWeightsMode.__members__:
COMPRESSION_OPTIONS["INT8_ASYM"] = {"mode": nncf.CompressWeightsMode.INT8_ASYM}

Expand Down

0 comments on commit b8d243f

Please sign in to comment.