From 7312b7ae44d088cc1b4ce4943ff9a1b19cabc0c2 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Tue, 17 Dec 2024 06:59:40 -0500
Subject: [PATCH] fix all auto-gptq tests

Signed-off-by: jiqing-feng
---
 optimum/gptq/quantizer.py       | 11 +++++++++--
 optimum/gptq/utils.py           |  2 +-
 optimum/utils/testing_utils.py  |  6 +++---
 tests/gptq/test_quantization.py |  5 ++++-
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 976f6418b3b..6a845bb23db 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -32,7 +32,14 @@
 from ..version import __version__ as optimum_version
 from .constants import GPTQ_CONFIG
 from .data import get_dataset, prepare_dataset
-from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
+from .utils import (
+    get_block_name_with_pattern,
+    get_device,
+    get_layers,
+    get_preceding_modules,
+    get_seqlen,
+    nested_move_to,
+)
 
 
 if is_accelerate_available():
@@ -53,7 +60,7 @@
     from gptqmodel import exllama_set_max_input_length
     from gptqmodel.quantization import GPTQ
     from gptqmodel.utils.importer import hf_select_quant_linear
-    from gptqmodel.utils.model import hf_convert_gptq_v1_to_v2_format, hf_convert_gptq_v2_to_v1_format, nested_move_to
+    from gptqmodel.utils.model import hf_convert_gptq_v1_to_v2_format, hf_convert_gptq_v2_to_v1_format
     from gptqmodel.utils.model import hf_gptqmodel_post_init as gptq_post_init
     from gptqmodel.version import __version__ as gptqmodel_version
 
diff --git a/optimum/gptq/utils.py b/optimum/gptq/utils.py
index 2b842253a42..732ecbd66b9 100644
--- a/optimum/gptq/utils.py
+++ b/optimum/gptq/utils.py
@@ -115,7 +115,7 @@ def get_seqlen(model: nn.Module):
     return 2048
 
 
-def move_to(obj: torch.Tensor | nn.Module, device: torch.device):
+def move_to(obj: torch.Tensor, device: torch.device):
     if get_device(obj) != device:
         obj = obj.to(device)
     return obj
diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py
index 5bf57f4d41b..7a5e27860a4 100644
--- a/optimum/utils/testing_utils.py
+++ b/optimum/utils/testing_utils.py
@@ -65,9 +65,9 @@ def require_gptq(test_case):
     """
     Decorator marking a test that requires gptqmodel or auto-gptq. These tests are skipped when gptqmodel and auto-gptq are not installed.
     """
-    return unittest.skipUnless(is_auto_gptq_available() or is_gptqmodel_available(), "test requires auto-gptq")(
-        test_case
-    )
+    return unittest.skipUnless(
+        is_auto_gptq_available() or is_gptqmodel_available(), "test requires gptqmodel or auto-gptq"
+    )(test_case)
 
 
 def require_torch_gpu(test_case):
diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py
index b6b50fb617d..0cf2416bb17 100644
--- a/tests/gptq/test_quantization.py
+++ b/tests/gptq/test_quantization.py
@@ -152,6 +152,9 @@ def test_serialization(self):
         """
         Test the serialization of the model and the loading of the quantized weights
        """
+        # AutoGPTQ does not support CPU
+        if self.device_map_for_quantization == "cpu" and not is_gptqmodel_available():
+            return
         with tempfile.TemporaryDirectory() as tmpdirname:
             self.quantizer.save(self.quantized_model, tmpdirname)
@@ -309,7 +312,7 @@ def test_exllama_serialization(self):
                 device_map={"": self.device_for_inference},
             )
             self.check_quantized_layers_type(
-                quantized_model_from_saved, "exllama" if is_gptqmodel_available else "exllamav2"
+                quantized_model_from_saved, "exllama" if is_gptqmodel_available() else "exllamav2"
             )
 
     # transformers and auto-gptq compatibility
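
The one-character fix in test_exllama_serialization is easy to miss: the original conditional used the function object is_gptqmodel_available rather than calling it, and a function object is always truthy, so the branch always expected "exllama" regardless of which backend was installed. Below is a minimal, self-contained Python sketch of that pitfall; the is_gptqmodel_available defined here is a stand-in for illustration, not the real optimum helper.

# Stand-in availability check; assume gptqmodel is NOT installed in this scenario.
def is_gptqmodel_available() -> bool:
    return False


# Bug: referencing the function object is always truthy, so the result is ignored.
kernel = "exllama" if is_gptqmodel_available else "exllamav2"
assert kernel == "exllama"  # picked even though gptqmodel is unavailable

# Fix: call the function so the actual availability drives the choice.
kernel = "exllama" if is_gptqmodel_available() else "exllamav2"
assert kernel == "exllamav2"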