
Enable gptqmodel #35012

Merged · 63 commits · Jan 15, 2025
Changes from 4 commits
Commits
4c567b3
gptqmodel
jiqing-feng Nov 29, 2024
1d8f83e
fix format
jiqing-feng Nov 29, 2024
9f44604
update readme
jiqing-feng Dec 2, 2024
62cd0dd
Merge branch 'main' into gptq
jiqing-feng Dec 2, 2024
8c88315
gptqmodel need use checkpoint_format (#1)
LRL-ModelCloud Dec 3, 2024
ef0fb56
Revert quantizer_gptq.py (#2)
LRL-ModelCloud Dec 4, 2024
0191322
Merge branch 'main' into gptq
jiqing-feng Dec 4, 2024
0655960
limit gptqmodel and optimum version
jiqing-feng Dec 4, 2024
be914ea
fix format
jiqing-feng Dec 4, 2024
aa9a5c6
fix warning
jiqing-feng Dec 4, 2024
a4bc251
fix version check
jiqing-feng Dec 4, 2024
9ae979b
revert unrelated changes
jiqing-feng Dec 4, 2024
a73a8c2
enable gptqmodel tests
jiqing-feng Dec 4, 2024
c18a5f1
fix requires gptq
jiqing-feng Dec 4, 2024
27ac615
Fix Transformer compat (#3)
ZX-ModelCloud Dec 5, 2024
d3ad24b
Merge branch 'main' into gptq
jiqing-feng Dec 7, 2024
3972d2e
fix format
jiqing-feng Dec 10, 2024
2612dd7
Merge branch 'main' into gptq
jiqing-feng Dec 10, 2024
99b2ed7
fix format again
jiqing-feng Dec 10, 2024
ac14b9f
update gptqmodel version (#6)
ZX-ModelCloud Dec 16, 2024
0276854
fix unit test (#5)
ZX-ModelCloud Dec 19, 2024
8bde513
Merge branch 'main' into gptq
jiqing-feng Dec 19, 2024
4ffc7d1
backend is loading_attibutes (#7)
LRL-ModelCloud Dec 20, 2024
5474f89
fix format and tests
jiqing-feng Dec 20, 2024
f9e7e45
Merge branch 'main' into gptq
jiqing-feng Dec 20, 2024
99b5f14
fix memory check
jiqing-feng Dec 20, 2024
331b56a
Merge branch 'main' into gptq
jiqing-feng Dec 23, 2024
409f6a2
fix device mismatch
jiqing-feng Dec 23, 2024
c996a41
fix result check
jiqing-feng Dec 23, 2024
84e972c
Merge branch 'main' into gptq
jiqing-feng Dec 23, 2024
dbf68e8
Update src/transformers/quantizers/quantizer_gptq.py
jiqing-feng Dec 24, 2024
f4c2ad3
Update src/transformers/quantizers/quantizer_gptq.py
jiqing-feng Dec 24, 2024
9185f8b
Update src/transformers/quantizers/quantizer_gptq.py
jiqing-feng Dec 24, 2024
8d69ba4
Merge branch 'main' into gptq
jiqing-feng Dec 24, 2024
226953a
Merge branch 'main' into gptq
MekkCyber Dec 24, 2024
65ee44b
update tests
jiqing-feng Dec 24, 2024
34d0ec0
review: update docs (#10)
Qubitium Dec 24, 2024
9d71301
Merge branch 'main' into gptq
jiqing-feng Dec 24, 2024
153121a
review: update docs (#12)
Qubitium Dec 24, 2024
b270b2d
update tests for gptqmodel
jiqing-feng Dec 24, 2024
7120899
update document (#9)
ZX-ModelCloud Dec 24, 2024
a7fcfd7
Merge branch 'main' into gptq
jiqing-feng Dec 24, 2024
8e36a0e
typo
Qubitium Dec 24, 2024
0aef2df
doc note for asymmetric quant
Qubitium Dec 24, 2024
31a6baa
typo with apple silicon(e)
Qubitium Dec 24, 2024
d7c8890
typo for marlin
Qubitium Dec 24, 2024
db33fd5
Merge branch 'main' into gptq
jiqing-feng Dec 25, 2024
945f663
column name revert: review
Qubitium Dec 26, 2024
fc7b971
Merge branch 'main' into gptq
jiqing-feng Dec 27, 2024
6cb77d5
Merge branch 'main' into gptq
jiqing-feng Dec 30, 2024
2234122
Merge branch 'main' into gptq
Qubitium Jan 3, 2025
d07ed96
Merge branch 'main' into gptq
jiqing-feng Jan 9, 2025
a20dfd3
Merge branch 'main' into gptq
jiqing-feng Jan 9, 2025
91d12cc
doc rocm support
Qubitium Jan 9, 2025
1ec6fe7
Update docs/source/en/quantization/gptq.md
Qubitium Jan 10, 2025
7d2b708
Update docs/source/en/quantization/gptq.md
Qubitium Jan 10, 2025
8c2a8b3
Update docs/source/en/quantization/gptq.md
Qubitium Jan 10, 2025
053e0ad
Update docs/source/en/quantization/gptq.md
Qubitium Jan 10, 2025
d3bfbb0
Update docs/source/en/quantization/overview.md
Qubitium Jan 10, 2025
1d883ec
Update docs/source/en/quantization/overview.md
Qubitium Jan 10, 2025
2806f71
Merge branch 'main' into gptq
Qubitium Jan 10, 2025
25169bd
Merge branch 'main' into gptq
jiqing-feng Jan 10, 2025
5ea104a
Merge branch 'main' into gptq
jiqing-feng Jan 14, 2025
11 changes: 10 additions & 1 deletion docs/source/en/quantization/gptq.md
@@ -24,10 +24,19 @@ Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google

The [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) library implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find the version of the weights that minimizes error. The weights are quantized to int4 but restored to fp16 on the fly during inference. This can reduce memory usage by almost 4x because the int4 weights are dequantized in a fused kernel rather than in the GPU's global memory, and you can also expect a speedup in inference because a lower bitwidth takes less time to communicate.

[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) is being replaced by [GPTQModel](https://github.com/ModelCloud/GPTQModel); auto_gptq will be deprecated in the future.

Before you begin, make sure the following libraries are installed:

```bash
pip install auto-gptq
```
or
```bash
pip install gptqmodel
```

```bash
pip install --upgrade accelerate optimum transformers
```

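For context, the snippet below is a minimal sketch of quantizing a model through this GPTQ integration; the model id and calibration dataset are illustrative placeholders rather than anything prescribed by this PR, and either backend (auto-gptq or gptqmodel) is assumed to be installed.

```py
# Minimal sketch: quantize a model through transformers' GPTQ integration.
# The model id and calibration dataset are illustrative placeholders.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit quantization calibrated on the "c4" dataset via optimum's GPTQ quantizer.
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

# Whichever backend is installed (auto-gptq or gptqmodel) is picked up through optimum.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=gptq_config
)
quantized_model.save_pretrained("opt-125m-gptq")
```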
@@ -110,7 +119,7 @@ Only 4-bit models are supported, and we recommend deactivating the ExLlama kerne

</Tip>

The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file.
The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2) or GPTQModel, then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file.

```py
import torch
...
```
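As a complementary sketch of the CPU path described above, the example below disables the ExLlama kernels through `use_exllama=False`; the checkpoint name is only a placeholder for any 4-bit GPTQ model.

```py
# Sketch: run a prequantized GPTQ checkpoint on CPU with the ExLlama kernels disabled.
# The checkpoint name is an illustrative placeholder.
import torch
from transformers import AutoModelForCausalLM, GPTQConfig

quantization_config = GPTQConfig(bits=4, use_exllama=False)
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-GPTQ",  # any 4-bit GPTQ checkpoint
    torch_dtype=torch.float16,
    device_map="cpu",
    quantization_config=quantization_config,
)
```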
8 changes: 7 additions & 1 deletion docs/source/en/quantization/overview.md
@@ -53,7 +53,7 @@ Use the table below to help you decide which quantization method to use.
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [GPTQ](./gptq) | 🔴 | 🟡 *** | 🟢 | 🟢 | 🔴 | 🟡 *** | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
@@ -72,3 +72,9 @@ We value your feedback to help identify bugs before the full release! Check out
\** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships.

</Tip>

<Tip>

\*** GPTQ only supports 4-bit on Intel CPU / GPU.

</Tip>
21 changes: 13 additions & 8 deletions src/transformers/quantizers/quantizer_gptq.py
@@ -22,7 +22,7 @@
if TYPE_CHECKING:
from ..modeling_utils import PreTrainedModel

from ..utils import is_auto_gptq_available, is_optimum_available, is_torch_available, logging
from ..utils import is_auto_gptq_available, is_gptqmodel_available, is_optimum_available, is_torch_available, logging
from ..utils.quantization_config import GPTQConfig, QuantizationConfigMixin


@@ -35,11 +35,11 @@
class GptqHfQuantizer(HfQuantizer):
"""
Quantizer of the GPTQ method - for GPTQ the quantizer support calibration of the model through
`auto_gptq` package. Quantization is done under the hood for users if they load a non-prequantized model.
`auto_gptq` or `gptqmodel` package. Quantization is done under the hood for users if they load a non-prequantized model.
"""

requires_calibration = False
required_packages = ["optimum", "auto_gptq"]
required_packages = ["optimum", "auto_gptq", "gptqmodel"]
optimum_quantizer = None

def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
@@ -49,16 +49,21 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
self.optimum_quantizer = GPTQQuantizer.from_dict(self.quantization_config.to_dict_optimum())

def validate_environment(self, *args, **kwargs):
gptq_supports_cpu = version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")
gptq_supports_cpu = (
is_auto_gptq_available()
and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")
) or is_gptqmodel_available()
if not gptq_supports_cpu and not torch.cuda.is_available():
raise RuntimeError("GPU is required to quantize or run quantize model.")
elif not (is_optimum_available() and is_auto_gptq_available()):
elif not (is_optimum_available() and (is_auto_gptq_available() or is_gptqmodel_available())):
raise ImportError(
"Loading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq library (`pip install auto-gptq`)"
"Loading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq or gptqmodel library (`pip install auto-gptq` or `pip install gptqmodel`)"
)
elif version.parse(importlib.metadata.version("auto_gptq")) < version.parse("0.4.2"):
elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse(
"0.4.2"
):
raise ImportError(
"You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq`"
"You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel`"
)
Member:
can you add a message mentioning that autogptq will be deprecated? I think we can do it two transformers versions from now. For optimum, maybe we can deprecate it a bit later than in transformers to make sure that we can still revert if there is a big issue.

Contributor Author:
done.

Member:
Don't forget that users need to use the latest version of optimum with gptqmodel.

Contributor Author:
I have limited the optimum and gptqmodel versions. The version limits can be changed after gptqmodel and optimum are released.


def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
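As a rough illustration of the version limits mentioned in the review thread above, a gate of this shape could be used; the minimum versions are placeholders, not the values this PR pins.

```py
# Illustrative version gate for the optimum / gptqmodel pairing.
# The minimum versions below are placeholders, not the values pinned by this PR.
import importlib.metadata

from packaging import version

MIN_OPTIMUM = version.parse("1.23.99")    # placeholder
MIN_GPTQMODEL = version.parse("1.4.99")   # placeholder

if version.parse(importlib.metadata.version("optimum")) < MIN_OPTIMUM:
    raise ImportError("Using gptqmodel requires a newer optimum: `pip install --upgrade optimum`")
if version.parse(importlib.metadata.version("gptqmodel")) < MIN_GPTQMODEL:
    raise ImportError("Please upgrade gptqmodel: `pip install --upgrade gptqmodel`")
```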
1 change: 1 addition & 0 deletions src/transformers/utils/__init__.py
@@ -143,6 +143,7 @@
is_g2p_en_available,
is_galore_torch_available,
is_gguf_available,
is_gptqmodel_available,
is_grokadamw_available,
is_hqq_available,
is_in_notebook,
5 changes: 5 additions & 0 deletions src/transformers/utils/import_utils.py
@@ -139,6 +139,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
_openai_available = _is_package_available("openai")
_optimum_available = _is_package_available("optimum")
_auto_gptq_available = _is_package_available("auto_gptq")
_gptqmodel_available = _is_package_available("gptqmodel")
# `importlib.metadata.version` doesn't work with `awq`
_auto_awq_available = importlib.util.find_spec("awq") is not None
_quanto_available = _is_package_available("quanto")
@@ -1005,6 +1006,10 @@ def is_auto_gptq_available():
return _auto_gptq_available


def is_gptqmodel_available():
return _gptqmodel_available


def is_eetq_available():
return _eetq_available

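To show how the new helper is meant to be consumed, here is a small usage sketch; the backend-selection logic is illustrative, not the exact code used by the quantizer above.

```py
# Sketch: pick a GPTQ backend based on the new availability helpers.
from transformers.utils import is_auto_gptq_available, is_gptqmodel_available

if is_gptqmodel_available():
    backend = "gptqmodel"
elif is_auto_gptq_available():
    backend = "auto_gptq"
else:
    raise ImportError(
        "Install gptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`)."
    )

print(f"Using GPTQ backend: {backend}")
```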