diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 976f6418b3..6a845bb23d 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -32,7 +32,14 @@
 from ..version import __version__ as optimum_version
 from .constants import GPTQ_CONFIG
 from .data import get_dataset, prepare_dataset
-from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
+from .utils import (
+    get_block_name_with_pattern,
+    get_device,
+    get_layers,
+    get_preceding_modules,
+    get_seqlen,
+    nested_move_to,
+)
 
 
 if is_accelerate_available():
@@ -53,7 +60,7 @@
     from gptqmodel import exllama_set_max_input_length
     from gptqmodel.quantization import GPTQ
     from gptqmodel.utils.importer import hf_select_quant_linear
-    from gptqmodel.utils.model import hf_convert_gptq_v1_to_v2_format, hf_convert_gptq_v2_to_v1_format, nested_move_to
+    from gptqmodel.utils.model import hf_convert_gptq_v1_to_v2_format, hf_convert_gptq_v2_to_v1_format
     from gptqmodel.utils.model import hf_gptqmodel_post_init as gptq_post_init
     from gptqmodel.version import __version__ as gptqmodel_version
diff --git a/optimum/gptq/utils.py b/optimum/gptq/utils.py
index 2b842253a4..732ecbd66b 100644
--- a/optimum/gptq/utils.py
+++ b/optimum/gptq/utils.py
@@ -115,7 +115,7 @@ def get_seqlen(model: nn.Module):
     return 2048
 
 
-def move_to(obj: torch.Tensor | nn.Module, device: torch.device):
+def move_to(obj: torch.Tensor, device: torch.device):
     if get_device(obj) != device:
         obj = obj.to(device)
     return obj
diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py
index 5bf57f4d41..7a5e27860a 100644
--- a/optimum/utils/testing_utils.py
+++ b/optimum/utils/testing_utils.py
@@ -65,9 +65,9 @@ def require_gptq(test_case):
     """
    Decorator marking a test that requires gptqmodel or auto-gptq. These tests are skipped when gptqmodel and auto-gptq are not installed.
     """
-    return unittest.skipUnless(is_auto_gptq_available() or is_gptqmodel_available(), "test requires auto-gptq")(
-        test_case
-    )
+    return unittest.skipUnless(
+        is_auto_gptq_available() or is_gptqmodel_available(), "test requires gptqmodel or auto-gptq"
+    )(test_case)
 
 
 def require_torch_gpu(test_case):
diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py
index b6b50fb617..0cf2416bb1 100644
--- a/tests/gptq/test_quantization.py
+++ b/tests/gptq/test_quantization.py
@@ -152,6 +152,9 @@ def test_serialization(self):
         """
         Test the serialization of the model and the loading of the quantized weights
         """
+        # AutoGPTQ does not support CPU
+        if self.device_map_for_quantization == "cpu" and not is_gptqmodel_available():
+            return
         with tempfile.TemporaryDirectory() as tmpdirname:
             self.quantizer.save(self.quantized_model, tmpdirname)
@@ -309,7 +312,7 @@ def test_exllama_serialization(self):
                 device_map={"": self.device_for_inference},
             )
             self.check_quantized_layers_type(
-                quantized_model_from_saved, "exllama" if is_gptqmodel_available else "exllamav2"
+                quantized_model_from_saved, "exllama" if is_gptqmodel_available() else "exllamav2"
             )
 
             # transformers and auto-gptq compatibility