Skip to content

Commit

Permalink
Enable 4BIT_MAXIMUM compression option (#345)
Browse files Browse the repository at this point in the history
4BIT_MAXIMUM compression option has the following settings:

```
"4BIT_MAXIMUM": {
    "mode": nncf.CompressWeightsMode.INT4_SYM,
    "group_size": 128,
    "ratio": 1,
    "all_layers": True,
}
```
  • Loading branch information
skuros authored Apr 9, 2024
1 parent 053dd88 commit b8d243f
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 7 deletions.
2 changes: 1 addition & 1 deletion llm_bench/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Parameters:
* `--output_dir` - output directory for saving OpenVINO model
* `--precision` - (optional, default FP32), precision for model conversion FP32 or FP16
* `--save_orig` - flag for saving original pytorch model, model will be located in `<output_dir>/pytorch` subdirectory.
* `--compress_weights` - The weight compression option, INT8 - INT8 weights, 4BIT_DEFAULT - for 4-bit weights compression with predefined configuration, INT4_SYM - for INT4 compressed weights with symmetric quantization, INT4_ASYM - for INT4 compressed weights with asymmetric quantization. You can specify multiple backends separated by a space.
* `--compress_weights` - The weight compression option, INT8 - INT8 weights, 4BIT_DEFAULT - for 4-bit compression with predefined configs with performance-accuracy trade-off, 4BIT_MAXIMUM - for 4-bit compression with predefined configs for the best performance, INT4_SYM - for INT4 compressed weights with symmetric quantization, INT4_ASYM - for INT4 compressed weights with asymmetric quantization. You can specify multiple backends separated by a space.
* `--compress_weights_backends` - (optional, default openvino) backends for weights compression, this option has an effect only with `--compress_weights`. You can specify multiple backends separated by a space.
* `--ratio` - Compression ratio between primary and backup precision, e.g. INT4/INT8.
* `--group_size` - Size of the group of weights that share the same quantization parameters
Expand Down
5 changes: 3 additions & 2 deletions llm_bench/python/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -1350,11 +1350,12 @@ def main():
"-c",
"--compress_weights",
type=str,
choices=["INT8", "INT8_ASYM", "INT8_SYM", "4BIT_DEFAULT", "INT4_SYM", "INT4_ASYM"],
choices=["INT8", "INT8_ASYM", "INT8_SYM", "4BIT_DEFAULT", "4BIT_MAXIMUM", "INT4_SYM", "INT4_ASYM"],
nargs="+",
help=(
"The weight compression option, e.g. INT8 - INT8 weights (deprecated, please use INT8_ASYM instead), "
"4BIT_DEFAULT - for 4-bit compression with predefined configs, "
"4BIT_DEFAULT - for 4-bit compression with predefined configs with performance-accuracy trade-off, "
"4BIT_MAXIMUM - for 4-bit compression with predefined configs for the best performance, "
"INT4_* - for INT4 compressed weights."
),
)
Expand Down
3 changes: 2 additions & 1 deletion llm_bench/python/utils/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,8 @@ def get_model_precision(model_name_list):
'OV_FP32-INT8_ASYM', 'OV_FP32-INT8_SYM', 'OV_FP16-INT8_ASYM', 'OV_FP16-INT8_SYM',
'PT_FP32-INT8', 'PT_FP16-INT8', 'PT_FP32-INT8_ASYM', 'PT_FP32-INT8_SYM', 'PT_FP16-INT8_ASYM', 'PT_FP16-INT8_SYM',
'GPTQ_INT4-FP32', 'GPTQ_INT4-FP16', 'INT4',
'OV_FP16-INT4_SYM', 'OV_FP16-INT4_ASYM', 'OV_FP32-INT4_SYM', 'OV_FP32-INT4_ASYM', 'OV_FP32-4BIT_DEFAULT', 'OV_FP16-4BIT_DEFAULT']
'OV_FP16-INT4_SYM', 'OV_FP16-INT4_ASYM', 'OV_FP32-INT4_SYM', 'OV_FP32-INT4_ASYM',
'OV_FP32-4BIT_DEFAULT', 'OV_FP16-4BIT_DEFAULT', 'OV_FP32-4BIT_MAXIMUM', 'OV_FP16-4BIT_MAXIMUM']
model_precision = 'unknown'
# Search from right to left of model path
for i in range(len(model_name_list) - 1, -1, -1):
Expand Down
11 changes: 8 additions & 3 deletions llm_bench/python/utils/nncf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

import nncf


COMPRESSION_OPTIONS = {
"INT8": {"mode": nncf.CompressWeightsMode.INT8 if "INT8_ASYM" not in nncf.CompressWeightsMode.__members__ else nncf.CompressWeightsMode.INT8_ASYM},
"INT8": {
"mode": nncf.CompressWeightsMode.INT8 if "INT8_ASYM" not in nncf.CompressWeightsMode.__members__ else nncf.CompressWeightsMode.INT8_ASYM},
"INT4_SYM": {
"mode": nncf.CompressWeightsMode.INT4_SYM,
"group_size": 128,
Expand All @@ -13,9 +13,14 @@
"mode": nncf.CompressWeightsMode.INT4_ASYM,
"group_size": 128,
},
"4BIT_MAXIMUM": {
"mode": nncf.CompressWeightsMode.INT4_SYM,
"group_size": 128,
"ratio": 1,
"all_layers": True,
},
}


if "INT8_ASYM" in nncf.CompressWeightsMode.__members__:
COMPRESSION_OPTIONS["INT8_ASYM"] = {"mode": nncf.CompressWeightsMode.INT8_ASYM}

Expand Down

0 comments on commit b8d243f

Please sign in to comment.