diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index d92d1b850a..61ba67b303 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -29,10 +29,10 @@
 from ..utils import is_accelerate_available, is_auto_gptq_available, is_gptqmodel_available
 from ..utils.modeling_utils import recurse_getattr
+from ..version import __version__ as optimum_version
 from .constants import GPTQ_CONFIG
 from .data import get_dataset, prepare_dataset
 from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
-from ..version import __version__ as optimum_version
 
 
 if is_accelerate_available():
@@ -43,11 +43,11 @@
     from accelerate.hooks import remove_hook_from_module
 
 if is_auto_gptq_available():
+    from auto_gptq import __version__ as autogptq_version
     from auto_gptq import exllama_set_max_input_length
     from auto_gptq.modeling._utils import autogptq_post_init as gptq_post_init
     from auto_gptq.quantization import GPTQ
     from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear
-    from auto_gptq import __version__ as autogptq_version
 
 if is_gptqmodel_available():
     from gptqmodel import exllama_set_max_input_length
@@ -128,8 +128,7 @@ def __init__(
                 Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta.
                 i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
             backend (`str`, *optional*):
-                Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only
-                valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
+                Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
             use_cuda_fp16 (`bool`, defaults to `False`):
                 Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
             model_seqlen (`Optional[int]`, defaults to `None`):
@@ -246,7 +245,7 @@ def to_dict(self):
         if gptq_dict.get("meta") is None:
             gptq_dict["meta"] = {}
-
+
         meta = gptq_dict["meta"]
         # store both optimum:version and gptq_lib:version into quantize_config.meta.quantizer
         if meta.get("quantizer") is None:
@@ -719,7 +718,9 @@ class StoreAttr(object):
             pass
 
         if is_gptqmodel_available():
-            model, _ = hf_convert_gptq_v1_to_v2_format(model, self.bits, self.quant_linear, self.checkpoint_format, self.meta)
+            model, _ = hf_convert_gptq_v1_to_v2_format(
+                model, self.bits, self.quant_linear, self.checkpoint_format, self.meta
+            )
 
         model.quantize_config = StoreAttr()
         model.quantize_config.desc_act = self.desc_act
@@ -790,9 +791,12 @@ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", sa
         """
 
         # convert gptqmodel internal gptq_v2 format to v1 for max compatibility
-        model, converted = hf_convert_gptq_v2_to_v1_format(model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta)
-        if converted:
-            self.checkpoint_format = "gptq"
+        if is_gptqmodel_available():
+            model, converted = hf_convert_gptq_v2_to_v1_format(
+                model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta
+            )
+            if converted:
+                self.checkpoint_format = "gptq"
 
         os.makedirs(save_dir, exist_ok=True)
         model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py
index 8c2de4cae5..8f7635ce04 100644
--- a/optimum/utils/import_utils.py
+++ b/optimum/utils/import_utils.py
@@ -52,7 +52,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
 TRANSFORMERS_MINIMUM_VERSION = version.parse("4.25.0")
 DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0")
 AUTOGPTQ_MINIMUM_VERSION = version.parse("0.4.99")  # Allows 0.5.0.dev0
-GPTQMODEL_MINIMUM_VERSION = version.parse("1.3.99")  # Allows 1.4.0.dev0
+GPTQMODEL_MINIMUM_VERSION = version.parse("1.4.1")  # Allows 1.4.0.dev0
 
 
 # This is the minimal required version to support some ONNX Runtime features
diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py
index b16e77fcc5..dbbedcec98 100644
--- a/tests/gptq/test_quantization.py
+++ b/tests/gptq/test_quantization.py
@@ -193,7 +193,6 @@ class GPTQTestCUDA(GPTQTest):
     expected_fp16_perplexity = 38
     expected_quantized_perplexity = 45
 
-
     def test_perplexity(self):
         """
         A simple test to check if the model conversion has been done correctly by checking on the
@@ -309,7 +308,9 @@ def test_exllama_serialization(self):
                 save_folder=tmpdirname,
                 device_map={"": self.device_for_inference},
             )
-            self.check_quantized_layers_type(quantized_model_from_saved, "exllama" if is_gptqmodel_available else "exllamav2")
+            self.check_quantized_layers_type(
+                quantized_model_from_saved, "exllama" if is_gptqmodel_available else "exllamav2"
+            )
 
             # transformers and auto-gptq compatibility
             # quantized models are more compatible with device map than
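
For reference, a minimal usage sketch of the code paths touched above, assuming a GPTQModel installation that satisfies the bumped minimum version. The checkpoint name, dataset choice, and `backend` value are illustrative placeholders, not part of this diff; only `GPTQQuantizer`, `quantize_model`, and `save` are taken from the existing `optimum.gptq` API.

```python
# Illustrative sketch only: quantize a small causal LM with GPTQQuantizer and save it.
# Assumes gptqmodel >= GPTQMODEL_MINIMUM_VERSION is installed so that the
# gptq_v2 -> v1 conversion guarded by is_gptqmodel_available() in save() runs.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.gptq import GPTQQuantizer

model_id = "facebook/opt-125m"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# `backend` follows the docstring updated above: with gptqmodel it accepts
# "auto", "auto_trainable" and more; with auto-gptq only None or "auto_trainable".
quantizer = GPTQQuantizer(bits=4, dataset="c4", model_seqlen=512, backend="auto_trainable")
quantized_model = quantizer.quantize_model(model, tokenizer)

# save() converts gptqmodel's internal gptq_v2 format back to v1 for compatibility,
# but (per this diff) only when gptqmodel is available, leaving auto-gptq-only
# environments untouched.
quantizer.save(quantized_model, "opt-125m-gptq")
```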