Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fp8 implementation #1100

Merged
merged 11 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions docs/source/openvino/export.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Check out the help for more options:

```text
usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
[--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
[--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
[--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
Expand Down Expand Up @@ -67,10 +67,9 @@ Optional arguments:
on your local machine arbitrary code present in the model repository.
--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
The weight format of the exported model.
--quant-mode {int8}
--quant-mode {int8,f8e4m3,f8e5m2}
Quantization precision mode. This is used for applying full model quantization including
activations. The only currently supported choice is 'int8' for int8 quantization of both
weights and activations.
activations.
--library {transformers,diffusers,timm,sentence_transformers,open_clip}
The library used to load the model before export. If not provided, will attempt to infer the
local checkpoint's library
Expand Down Expand Up @@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
</Tip>


Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Currently this is only supported for speech-to-text models. Please see the example below.
KodiaqQ marked this conversation as resolved.
Show resolved Hide resolved

```bash
optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
Expand Down
6 changes: 1 addition & 5 deletions optimum/commands/export/openvino.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"):
optional_group.add_argument(
"--quant-mode",
type=str,
choices=["int8"],
choices=["int8", "f8e4m3", "f8e5m2"],
default=None,
help=(
"Quantization precision mode. This is used for applying full model quantization including activations. "
"The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
),
)
optional_group.add_argument(
Expand Down Expand Up @@ -365,9 +364,6 @@ def run(self):
quantization_config["trust_remote_code"] = self.args.trust_remote_code
ov_config = OVConfig(quantization_config=quantization_config)
else:
if self.args.quant_mode != "int8":
raise ValueError("Only 'int8' quantization mode is currently supported.")

quantization_config = {
"weight_format": self.args.quant_mode,
"activation_format": self.args.quant_mode,
Expand Down
29 changes: 12 additions & 17 deletions optimum/intel/openvino/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from optimum.configuration_utils import BaseConfig

from ..utils.import_utils import is_nncf_available
from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_SPEECH_TO_TEXT_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS


if is_nncf_available():
Expand Down Expand Up @@ -638,9 +638,9 @@ def __init__(
SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and
reduces quantization error.
weight_format (`str`, defaults to "int8"):
Data format weights are quantized to. Possible values: ['int8'].
Data format weights are quantized to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
activation_format (`str`, defaults to "int8"):
Data format activations are compressed to. Possible values: ['int8'].
Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
"""
super().__init__(
bits=bits,
Expand Down Expand Up @@ -669,23 +669,20 @@ def post_init(self):
if self.bits != 8:
raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")

if self.dataset is not None:
if self.dataset not in PREDEFINED_SPEECH_TO_TEXT_DATASETS:
raise ValueError(
f"You have entered the following string value for dataset: {self.dataset}. But it is not supported."
f" Currently you can only choose {list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())}."
)

if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1):
raise ValueError(
f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
)

if self.weight_format != "int8":
raise ValueError("Only 'int8' weight format is currently supported.")

if self.activation_format != "int8":
raise ValueError("Only 'int8' activation format is currently supported.")
if not self.sym:
if self.activation_format != "int8":
raise ValueError(
f"Asymmetric quantization can not be performed in {self.activation_format} activation format."
)
if self.weight_format != "int8":
raise ValueError(
f"Asymmetric quantization can not be performed in {self.weight_format} weight format."
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest initializing sym as True inside the OVQuantizationConfig constructor if an fp8 mode is selected. This option is intended to be used with int data types and does not quite make sense with fp8 data types. Also, this way --sym won't need to be specified every time fp8 modes are used.

cc @AlexKoff88



class OVConfig(BaseConfig):
Expand Down Expand Up @@ -713,8 +710,6 @@ def __init__(
if self.quantization_config is not None:
if isinstance(self.quantization_config, OVWeightQuantizationConfig):
self.dtype = self.quantization_config.weight_format
else:
self.dtype = "int8"
else:
self.dtype = dtype
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this should be changed to:

if self.quantization_config is not None:
    self.dtype = self.quantization_config.weight_format
else:
    self.dtype = dtype


Expand Down
13 changes: 8 additions & 5 deletions optimum/intel/openvino/quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,11 +458,6 @@ def _quantize_ovbasemodel(
if calibration_dataset is None:
raise ValueError("Calibration dataset is required to run quantization.")

if quantization_config.weight_format != "int8":
raise ValueError("Only 'int8' weight format is currently supported.")
if quantization_config.activation_format != "int8":
raise ValueError("Only 'int8' activation format is currently supported.")

# Quantize model(s)
if isinstance(self.model, _OVModelForWhisper):
self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs)
Expand Down Expand Up @@ -1071,6 +1066,14 @@ def _full_quantization(
matmul=quantization_config.smooth_quant_alpha
)

q_mode_map = {
"f8e4m3": nncf.QuantizationMode.FP8_E4M3,
"f8e5m2": nncf.QuantizationMode.FP8_E5M2,
}

if quantization_config.activation_format in q_mode_map:
kwargs.update({"mode": q_mode_map[quantization_config.activation_format]})

quantized_model = nncf.quantize(
model,
calibration_dataset,
Expand Down
28 changes: 19 additions & 9 deletions tests/openvino/test_exporters_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,19 @@ class OVCLIExportTestCase(unittest.TestCase):
(
"automatic-speech-recognition",
"whisper",
"--quant-mode int8 --dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
"int8",
"--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
(14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25),
(14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18),
),
(
"text-generation",
"phi3",
"f8e4m3",
"--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code --sym",
(13,),
(10,),
),
]

TEST_4BIT_CONFIGURATIONS = [
Expand Down Expand Up @@ -407,26 +416,27 @@ def test_exporters_cli_full_quantization(
self,
task: str,
model_type: str,
quant_mode: str,
option: str,
expected_num_fq_nodes_per_model: Tuple[int],
expected_num_weight_nodes_per_model: Tuple[int],
):
with TemporaryDirectory() as tmpdir:
subprocess.run(
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} {option} {tmpdir}",
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --quant-mode {quant_mode} {option} {tmpdir}",
shell=True,
check=True,
)
model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(tmpdir)

submodels = []
models = [model]
if task == "automatic-speech-recognition":
submodels = [model.encoder, model.decoder, model.decoder_with_past]
self.assertEqual(len(expected_num_fq_nodes_per_model), len(submodels))
for i, model in enumerate(submodels):
actual_num_fq_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_fq_nodes)
self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes["int8"])
models = [model.encoder, model.decoder, model.decoder_with_past]
self.assertEqual(len(expected_num_fq_nodes_per_model), len(models))
for i, model in enumerate(models):
actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_f_nodes)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_f_nodes)
self.assertEqual(expected_num_f_nodes_per_model[i], actual_num_f_nodes)

self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])

def test_exporters_cli_int4_with_local_model_and_default_config(self):
with TemporaryDirectory() as tmpdir:
Expand Down
38 changes: 19 additions & 19 deletions tests/openvino/utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,31 +202,31 @@


def get_num_quantized_nodes(model):
num_fake_quantize = 0
num_weight_nodes = {
"int8": 0,
"int4": 0,
"f4e2m1": 0,
"f8e8m0": 0,
"nf4": 0,
num_fake_nodes = 0
types_map = {
"i8": "int8",
Comment on lines +210 to +211
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

"u8": "int8",
"i4": "int4",
"u4": "int4",
"f4e2m1": "f4e2m1",
"f8e8m0": "f8e8m0",
"nf4": "nf4",
"f8e4m3": "f8e4m3",
"f8e5m2": "f8e5m2",
}
num_weight_nodes = {n: 0 for n in types_map.values()}
ov_model = model if isinstance(model, ov.Model) else model.model
for elem in ov_model.get_ops():
if "FakeQuantize" in elem.name:
num_fake_quantize += 1
num_fake_nodes += 1
if "FakeConvert" in elem.name:
num_fake_nodes += 1
for i in range(elem.get_output_size()):
type_name = elem.get_output_element_type(i).get_type_name()
if type_name in ["i8", "u8"]:
num_weight_nodes["int8"] += 1
if type_name in ["i4", "u4"]:
num_weight_nodes["int4"] += 1
if type_name == "f4e2m1":
num_weight_nodes["f4e2m1"] += 1
if type_name == "f8e8m0":
num_weight_nodes["f8e8m0"] += 1
if type_name == "nf4":
num_weight_nodes["nf4"] += 1
return num_fake_quantize, num_weight_nodes
if type_name in types_map:
name = types_map[type_name]
num_weight_nodes[name] += 1
return num_fake_nodes, num_weight_nodes


@contextmanager
Expand Down
Loading