From 4c567b363febc56ef0d00ff5873fe1e0685f7d47 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 29 Nov 2024 15:22:27 +0000 Subject: [PATCH 01/42] gptqmodel Signed-off-by: jiqing-feng --- src/transformers/quantizers/quantizer_gptq.py | 16 ++++++++-------- src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 5 +++++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 233a5279d3f90e..32fe8d07e79fc3 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -22,7 +22,7 @@ if TYPE_CHECKING: from ..modeling_utils import PreTrainedModel -from ..utils import is_auto_gptq_available, is_optimum_available, is_torch_available, logging +from ..utils import is_auto_gptq_available, is_gptqmodel_available, is_optimum_available, is_torch_available, logging from ..utils.quantization_config import GPTQConfig, QuantizationConfigMixin @@ -35,11 +35,11 @@ class GptqHfQuantizer(HfQuantizer): """ Quantizer of the GPTQ method - for GPTQ the quantizer support calibration of the model through - `auto_gptq` package. Quantization is done under the hood for users if they load a non-prequantized model. + `auto_gptq` or `gptqmodel` package. Quantization is done under the hood for users if they load a non-prequantized model. 
""" requires_calibration = False - required_packages = ["optimum", "auto_gptq"] + required_packages = ["optimum", "gptqmodel"] optimum_quantizer = None def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): @@ -49,16 +49,16 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): self.optimum_quantizer = GPTQQuantizer.from_dict(self.quantization_config.to_dict_optimum()) def validate_environment(self, *args, **kwargs): - gptq_supports_cpu = version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2") + gptq_supports_cpu = (is_auto_gptq_available() and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")) or is_gptqmodel_available() if not gptq_supports_cpu and not torch.cuda.is_available(): raise RuntimeError("GPU is required to quantize or run quantize model.") - elif not (is_optimum_available() and is_auto_gptq_available()): + elif not (is_optimum_available() and (is_auto_gptq_available() or is_gptqmodel_available())): raise ImportError( - "Loading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq library (`pip install auto-gptq`)" + "Loading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq or gptqmodel library (`pip install auto-gptq` or `pip install gptqmodel`)" ) - elif version.parse(importlib.metadata.version("auto_gptq")) < version.parse("0.4.2"): + elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse("0.4.2"): raise ImportError( - "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq`" + "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel`" ) def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 492642d61babb5..06253e999b3aef 100755 --- 
a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -143,6 +143,7 @@ is_g2p_en_available, is_galore_torch_available, is_gguf_available, + is_gptqmodel_available, is_grokadamw_available, is_hqq_available, is_in_notebook, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 70bd236e3bb4ac..d3ab9cd58688fb 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -139,6 +139,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _openai_available = _is_package_available("openai") _optimum_available = _is_package_available("optimum") _auto_gptq_available = _is_package_available("auto_gptq") +_gptqmodel_available = _is_package_available("gptqmodel") # `importlib.metadata.version` doesn't work with `awq` _auto_awq_available = importlib.util.find_spec("awq") is not None _quanto_available = _is_package_available("quanto") @@ -1005,6 +1006,10 @@ def is_auto_gptq_available(): return _auto_gptq_available +def is_gptqmodel_available(): + return _gptqmodel_available + + def is_eetq_available(): return _eetq_available From 1d8f83e39f68c2e95950889225c9cb44c3e84511 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 29 Nov 2024 15:28:24 +0000 Subject: [PATCH 02/42] fix format Signed-off-by: jiqing-feng --- src/transformers/quantizers/quantizer_gptq.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 32fe8d07e79fc3..33589c1b5e2561 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -49,14 +49,19 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): self.optimum_quantizer = GPTQQuantizer.from_dict(self.quantization_config.to_dict_optimum()) def validate_environment(self, *args, **kwargs): - gptq_supports_cpu = 
(is_auto_gptq_available() and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")) or is_gptqmodel_available() + gptq_supports_cpu = ( + is_auto_gptq_available() + and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2") + ) or is_gptqmodel_available() if not gptq_supports_cpu and not torch.cuda.is_available(): raise RuntimeError("GPU is required to quantize or run quantize model.") elif not (is_optimum_available() and (is_auto_gptq_available() or is_gptqmodel_available())): raise ImportError( "Loading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq or gptqmodel library (`pip install auto-gptq` or `pip install gptqmodel`)" ) - elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse("0.4.2"): + elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse( + "0.4.2" + ): raise ImportError( "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel`" ) From 9f44604c657ef18cbcc302adbb05eec2a4a66a53 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 2 Dec 2024 13:05:32 +0000 Subject: [PATCH 03/42] update readme Signed-off-by: jiqing-feng --- docs/source/en/quantization/gptq.md | 11 ++++++++++- docs/source/en/quantization/overview.md | 8 +++++++- src/transformers/quantizers/quantizer_gptq.py | 2 +- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 5713ef4132a9a8..dbbc95e7c1c59e 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -24,10 +24,19 @@ Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google The [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) library implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix 
is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save your memory-usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory, and you can also expect a speedup in inference because using a lower bitwidth takes less time to communicate. +Now, we are going to replace [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) with [GPTQModel](https://github.com/ModelCloud/GPTQModel), the auto_gptq will be deprecated in the future. + Before you begin, make sure the following libraries are installed: ```bash pip install auto-gptq +``` +or +```bash +pip install gptqmodel +``` + +```bash pip install --upgrade accelerate optimum transformers ``` @@ -110,7 +119,7 @@ Only 4-bit models are supported, and we recommend deactivating the ExLlama kerne -The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. +The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2) or GPTQModel, then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. ```py import torch diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 0fb72d26058e55..7dfbf26f78cbae 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -53,7 +53,7 @@ Use the table below to help you decide which quantization method to use. 
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | | [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | | GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | -| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | +| [GPTQ](./gptq) | 🔴 | 🟡 *** | 🟢 | 🟢 | 🔴 | 🟡 *** | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | | [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | | [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | | [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | @@ -72,3 +72,9 @@ We value your feedback to help identify bugs before the full release! Check out \** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. + + + +\*** GPTQ only supports 4-bit on Intel CPU / GPU.
+ + diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 33589c1b5e2561..24ffa661f3d7ac 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -39,7 +39,7 @@ class GptqHfQuantizer(HfQuantizer): """ requires_calibration = False - required_packages = ["optimum", "gptqmodel"] + required_packages = ["optimum", "auto_gptq", "gptqmodel"] optimum_quantizer = None def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): From 8c883152aaddc8228435d232e95b12b4fac1ce18 Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud <165116337+LRL-ModelCloud@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:52:37 +0800 Subject: [PATCH 04/42] gptqmodel need use checkpoint_format (#1) * gptqmodel need use checkpoint_format * fix quantize * Update quantization_config.py * Update quantization_config.py * Update quantization_config.py --------- Co-authored-by: ZX-ModelCloud Co-authored-by: Qubitium-ModelCloud --- src/transformers/quantizers/quantizer_gptq.py | 6 ++++-- src/transformers/utils/quantization_config.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 24ffa661f3d7ac..6fc1fe955ac18f 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -78,11 +78,13 @@ def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwarg raise RuntimeError("We can only quantize pure text model.") if self.pre_quantized: - model = self.optimum_quantizer.convert_model(model) + model = self.optimum_quantizer.convert_model(model, **kwargs) + else: + self.optimum_quantizer.quantize_preprocess(model, **kwargs) def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): if self.pre_quantized: - model = self.optimum_quantizer.post_init_model(model) + model = 
self.optimum_quantizer.post_init_model(model, **kwargs) else: if self.quantization_config.tokenizer is None: self.quantization_config.tokenizer = model.name_or_path diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index bacbca94cd823f..36c7bb2afbe17c 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -575,6 +575,8 @@ class GPTQConfig(QuantizationConfigMixin): Whether to perform sequential quantization even within a single Transformer block. Instead of quantizing the entire block at once, we perform layer-wise quantization. As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers. + checkpoint_format (`str`, *optional*, defaults to `gptq`): + GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only. use_cuda_fp16 (`bool`, *optional*, defaults to `False`): Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16. 
model_seqlen (`int`, *optional*): @@ -616,6 +618,7 @@ def __init__( desc_act: bool = False, sym: bool = True, true_sequential: bool = True, + checkpoint_format: Optional[str] = "gptq", use_cuda_fp16: bool = False, model_seqlen: Optional[int] = None, block_name_to_quantize: Optional[str] = None, @@ -650,6 +653,7 @@ def __init__( self.disable_exllama = kwargs.pop("disable_exllama", None) self.cache_block_outputs = cache_block_outputs self.modules_in_block_to_quantize = modules_in_block_to_quantize + self.checkpoint_format = checkpoint_format self.post_init() def get_loading_attributes(self): From ef0fb56c99d6e74f7944647816386bd9c251a1fa Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud <165116337+LRL-ModelCloud@users.noreply.github.com> Date: Wed, 4 Dec 2024 08:55:27 +0800 Subject: [PATCH 05/42] Revert quantizer_gptq.py (#2) * revert quantizer_gptq.py change * pass **kwargs --- src/transformers/quantizers/quantizer_gptq.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 6fc1fe955ac18f..eec628e6d1b724 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -79,12 +79,10 @@ def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwarg if self.pre_quantized: model = self.optimum_quantizer.convert_model(model, **kwargs) - else: - self.optimum_quantizer.quantize_preprocess(model, **kwargs) def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): if self.pre_quantized: - model = self.optimum_quantizer.post_init_model(model, **kwargs) + model = self.optimum_quantizer.post_init_model(model) else: if self.quantization_config.tokenizer is None: self.quantization_config.tokenizer = model.name_or_path From 065596065a6ba13fa943d72ec732b6d31feb36df Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 4 Dec 2024 10:32:20 +0000 Subject: [PATCH 06/42] limit gptqmodel 
and optimum version Signed-off-by: jiqing-feng --- .../models/gemma/configuration_gemma.py | 1 - src/transformers/quantizers/quantizer_gptq.py | 13 ++++++++++--- src/transformers/utils/quantization_config.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index e170803cccab70..346f386ba698f2 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,7 +20,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index eec628e6d1b724..9d401cfe1d8892 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -49,22 +49,29 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): self.optimum_quantizer = GPTQQuantizer.from_dict(self.quantization_config.to_dict_optimum()) def validate_environment(self, *args, **kwargs): + if not is_optimum_available(): + raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)") + if is_auto_gptq_available() and is_gptqmodel_available(): + logger.warning("Detected gptqmodel and auto-gptq, will use gptqmodel, auto-gptq will be deprecated in the future.") + gptq_supports_cpu = ( is_auto_gptq_available() and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2") ) or is_gptqmodel_available() if not gptq_supports_cpu and not torch.cuda.is_available(): raise RuntimeError("GPU is required to quantize or run quantize model.") - elif not (is_optimum_available() and (is_auto_gptq_available() or is_gptqmodel_available())): + elif not (is_auto_gptq_available() or is_gptqmodel_available()): raise ImportError( 
- "Loading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq or gptqmodel library (`pip install auto-gptq` or `pip install gptqmodel`)" + "Loading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq or gptqmodel library (`pip install auto-gptq` or `pip install gptqmodel`). Please notice that auto-gptq will be deprecated in the future." ) elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse( "0.4.2" ): raise ImportError( - "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel`" + "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel`. Please notice that auto-gptq will be deprecated in the future." ) + elif is_gptqmodel_available() and (version.parse(importlib.metadata.version("gptqmodel")) <= version.parse("1.3.0") or version.parse(importlib.metadata.version("optimum")) < version.parse("1.24.0")): + raise ImportError("The gptqmodel version should be > 1.3.0, optimum version should >= 1.24.0") def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": if torch_dtype is None: diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 36c7bb2afbe17c..9f0d55ff1a2446 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -575,7 +575,7 @@ class GPTQConfig(QuantizationConfigMixin): Whether to perform sequential quantization even within a single Transformer block. Instead of quantizing the entire block at once, we perform layer-wise quantization. As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers. 
- checkpoint_format (`str`, *optional*, defaults to `gptq`): + checkpoint_format (`str`, *optional*, defaults to `"gptq"`): GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only. use_cuda_fp16 (`bool`, *optional*, defaults to `False`): Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16. From be914eaf958f9970a748549ba6906fc19427eefd Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 4 Dec 2024 10:34:27 +0000 Subject: [PATCH 07/42] fix format Signed-off-by: jiqing-feng --- src/transformers/quantizers/quantizer_gptq.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 9d401cfe1d8892..814b5c5a15cf1b 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -52,7 +52,9 @@ def validate_environment(self, *args, **kwargs): if not is_optimum_available(): raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)") if is_auto_gptq_available() and is_gptqmodel_available(): - logger.warning("Detected gptqmodel and auto-gptq, will use gptqmodel, auto-gptq will be deprecated in the future.") + logger.warning( + "Detected gptqmodel and auto-gptq, will use gptqmodel, auto-gptq will be deprecated in the future." + ) gptq_supports_cpu = ( is_auto_gptq_available() @@ -70,7 +72,10 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel`. Please notice that auto-gptq will be deprecated in the future." 
) - elif is_gptqmodel_available() and (version.parse(importlib.metadata.version("gptqmodel")) <= version.parse("1.3.0") or version.parse(importlib.metadata.version("optimum")) < version.parse("1.24.0")): + elif is_gptqmodel_available() and ( + version.parse(importlib.metadata.version("gptqmodel")) <= version.parse("1.3.0") + or version.parse(importlib.metadata.version("optimum")) < version.parse("1.24.0") + ): raise ImportError("The gptqmodel version should be > 1.3.0, optimum version should >= 1.24.0") def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": From aa9a5c61e4dbf9f8faea3feb1efa53ccdf41bb50 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 4 Dec 2024 10:45:34 +0000 Subject: [PATCH 08/42] fix warning Signed-off-by: jiqing-feng --- src/transformers/quantizers/quantizer_gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 814b5c5a15cf1b..f7b516bb95c5c3 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -64,7 +64,7 @@ def validate_environment(self, *args, **kwargs): raise RuntimeError("GPU is required to quantize or run quantize model.") elif not (is_auto_gptq_available() or is_gptqmodel_available()): raise ImportError( - "Loading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq or gptqmodel library (`pip install auto-gptq` or `pip install gptqmodel`). Please notice that auto-gptq will be deprecated in the future." + "Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`) library. Please notice that auto-gptq will be deprecated in the future." 
) elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse( "0.4.2" From a4bc251e4ae5628a9b9d70dfb34b5ea8baf71808 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 4 Dec 2024 11:07:55 +0000 Subject: [PATCH 09/42] fix version check Signed-off-by: jiqing-feng --- src/transformers/quantizers/quantizer_gptq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index f7b516bb95c5c3..a6ae314da0719e 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -73,10 +73,10 @@ def validate_environment(self, *args, **kwargs): "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel`. Please notice that auto-gptq will be deprecated in the future." ) elif is_gptqmodel_available() and ( - version.parse(importlib.metadata.version("gptqmodel")) <= version.parse("1.3.0") - or version.parse(importlib.metadata.version("optimum")) < version.parse("1.24.0") + version.parse(importlib.metadata.version("gptqmodel")) <= version.parse("1.3.1") + or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.99") ): - raise ImportError("The gptqmodel version should be > 1.3.0, optimum version should >= 1.24.0") + raise ImportError("The gptqmodel version should be >= 1.3.2, optimum version should >= 1.24.0") def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": if torch_dtype is None: From 9ae979b59c4dc54dacbc32c4aefbbff8536cf634 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 4 Dec 2024 13:18:22 +0000 Subject: [PATCH 10/42] revert unrelated changes Signed-off-by: jiqing-feng --- src/transformers/models/gemma/configuration_gemma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/gemma/configuration_gemma.py 
b/src/transformers/models/gemma/configuration_gemma.py index 346f386ba698f2..e170803cccab70 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,6 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from ...configuration_utils import PretrainedConfig From a73a8c25ccac99bef343d6b807095af982340c28 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 4 Dec 2024 14:53:49 +0000 Subject: [PATCH 11/42] enable gptqmodel tests Signed-off-by: jiqing-feng --- src/transformers/testing_utils.py | 7 +- tests/quantization/gptq/test_gptq.py | 161 +++++++++++++++------------ 2 files changed, 97 insertions(+), 71 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 25d837ccec0fbe..39a62ffb4ee41e 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -80,6 +80,7 @@ is_g2p_en_available, is_galore_torch_available, is_gguf_available, + is_gptqmodel_available, is_grokadamw_available, is_ipex_available, is_jieba_available, @@ -1192,11 +1193,13 @@ def require_tensorboard(test_case): return unittest.skipUnless(is_tensorboard_available(), "test requires tensorboard") -def require_auto_gptq(test_case): +def require_gptq(test_case): """ Decorator for auto_gptq dependency """ - return unittest.skipUnless(is_auto_gptq_available(), "test requires auto-gptq")(test_case) + return unittest.skipUnless( + is_gptqmodel_available() or is_auto_gptq_available(), "test requires gptqmodel or auto-gptq" + )(test_case) def require_auto_awq(test_case): diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index b1be9ac8c682c0..7dc410f2cd90ac 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -22,12 +22,13 @@ from transformers.testing_utils import ( is_torch_available, require_accelerate, - require_auto_gptq, + 
require_gptq, require_optimum, require_torch_gpu, require_torch_multi_gpu, slow, ) +from transformers.utils import is_auto_gptq_available, is_gptqmodel_available if is_torch_available(): @@ -76,23 +77,17 @@ def test_optimum_config(self): @slow @require_optimum -@require_auto_gptq -@require_torch_gpu +@require_gptq class GPTQTest(unittest.TestCase): - model_name = "bigscience/bloom-560m" + model_name = "Felladrin/Llama-160M-Chat-v1" input_text = "Hello my name is" EXPECTED_OUTPUTS = set() - EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I") - EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") - EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of") - EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.") - EXPECTED_OUTPUTS.add("Hello my name is Alyson, I am a student in the") - EXPECTED_OUTPUTS.add("Hello my name is Alyson and I am a very sweet,") + EXPECTED_OUTPUTS.add("Hello my name is Katie, I am a 22 year") # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings - EXPECTED_RELATIVE_DIFFERENCE = 1.664253062 + EXPECTED_RELATIVE_DIFFERENCE = 2.06183008 bits = 4 group_size = 128 @@ -103,7 +98,7 @@ class GPTQTest(unittest.TestCase): "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." ] - device_map = None + device_map = "cpu" # called only once for all test in this class @classmethod @@ -150,7 +145,7 @@ def test_device_and_dtype_assignment(self): Checks also if other models are casted correctly. 
""" # This should work - if self.device_map is None: + if self.device_map == "cpu" and torch.cuda.is_available(): _ = self.quantized_model.to(0) with self.assertRaises(ValueError): @@ -170,17 +165,29 @@ def test_quantized_layers_class(self): Simple test to check if the model conversion has been done correctly by checking on the class type of the linear layers of the converted models """ - from auto_gptq.utils.import_utils import dynamically_import_QuantLinear - - QuantLinear = dynamically_import_QuantLinear( - use_triton=False, - desc_act=self.desc_act, - group_size=self.group_size, - bits=self.bits, - disable_exllama=not self.use_exllama, - disable_exllamav2=True, - ) - self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) + if is_gptqmodel_available(): + from gptqmodel.utils.importer import hf_select_quant_linear + + QuantLinear = hf_select_quant_linear( + bits=self.bits, + group_size=self.group_size, + desc_act=self.desc_act, + sym=True, + device_map=self.device_map, + pack=False, + ) + elif is_auto_gptq_available(): + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear + + QuantLinear = hf_select_quant_linear( + use_triton=False, + desc_act=self.desc_act, + group_size=self.group_size, + bits=self.bits, + disable_exllama=not self.use_exllama, + disable_exllamav2=True, + ) + self.assertTrue(self.quantized_model.model.layers[0].mlp.gate_proj.__class__ == QuantLinear) def check_inference_correctness(self, model): r""" @@ -192,19 +199,19 @@ def check_inference_correctness(self, model): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") # Check the exactness of the results - output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(model.device), max_new_tokens=10) # Get the generation self.assertIn(self.tokenizer.decode(output_sequences[0], 
skip_special_tokens=True), self.EXPECTED_OUTPUTS) def check_quantized_layers_type(self, model, value): - self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE == value) + self.assertTrue(model.model.layers[0].mlp.gate_proj.QUANT_TYPE == value) def test_generate_quality(self): """ Simple test to check the quality of the model by comparing the generated tokens with the expected tokens """ - if self.device_map is None: + if self.device_map != "cpu": self.check_inference_correctness(self.quantized_model.to(0)) else: self.check_inference_correctness(self.quantized_model) @@ -215,15 +222,25 @@ def test_serialization(self): """ with tempfile.TemporaryDirectory() as tmpdirname: self.quantized_model.save_pretrained(tmpdirname) - if not self.use_exllama: - quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( - tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4) - ).to(0) - self.check_quantized_layers_type(quantized_model_from_saved, "cuda-old") + if is_auto_gptq_available() and not is_gptqmodel_available(): + quant_type = "cuda-old" if not self.use_exllama else "exllama" + if not self.use_exllama: + quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( + tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4) + ) + if self.device_map != "cpu": + quantized_model_from_saved = quantized_model_from_saved.to(0) + else: + quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( + tmpdirname, device_map=self.device_map + ) else: - # we need to put it directly to the gpu. 
Otherwise, we won't be able to initialize the exllama kernel - quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": 0}) - self.check_quantized_layers_type(quantized_model_from_saved, "exllama") + quant_type = "ipex" if self.device_map == "cpu" else "cuda" + quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( + tmpdirname, device_map=self.device_map + ) + + self.check_quantized_layers_type(quantized_model_from_saved, quant_type) self.check_inference_correctness(quantized_model_from_saved) @require_accelerate @@ -233,20 +250,26 @@ def test_serialization_big_model_inference(self): """ with tempfile.TemporaryDirectory() as tmpdirname: self.quantized_model.save_pretrained(tmpdirname) - quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto") + quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map) self.check_inference_correctness(quantized_model_from_saved) + +@require_torch_gpu +class GPTQTestCUDA(GPTQTest): + EXPECTED_RELATIVE_DIFFERENCE = 2.06183008 + device_map = {"": 0} + def test_change_loading_attributes(self): """ Test the serialization of the model and the loading of the quantized weights works with another config file """ with tempfile.TemporaryDirectory() as tmpdirname: self.quantized_model.save_pretrained(tmpdirname) - if not self.use_exllama: + if is_auto_gptq_available() and not is_gptqmodel_available() and not self.use_exllama: self.check_quantized_layers_type(self.quantized_model, "cuda-old") # we need to put it directly to the gpu. 
Otherwise, we won't be able to initialize the exllama kernel quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( - tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map={"": 0} + tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map=self.device_map ) self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits) self.check_quantized_layers_type(quantized_model_from_saved, "exllama") @@ -255,20 +278,20 @@ def test_change_loading_attributes(self): @require_accelerate @require_torch_multi_gpu -class GPTQTestDeviceMap(GPTQTest): +class GPTQTestDeviceMap(GPTQTestCUDA): device_map = "auto" @require_accelerate @require_torch_multi_gpu -class GPTQTestDeviceMapExllama(GPTQTest): +class GPTQTestDeviceMapExllama(GPTQTestCUDA): device_map = "auto" use_exllama = True @slow @require_optimum -@require_auto_gptq +@require_gptq @require_torch_gpu @require_accelerate class GPTQTestActOrderExllama(unittest.TestCase): @@ -343,7 +366,7 @@ def test_max_input_length(self): @slow @require_optimum -@require_auto_gptq +@require_gptq @require_torch_gpu @require_accelerate class GPTQTestExllamaV2(unittest.TestCase): @@ -405,32 +428,32 @@ def test_generate_quality(self): @require_torch_multi_gpu class GPTQTestDeviceMapCPUOffload(GPTQTest): device_map = { - "transformer.word_embeddings": 0, - "transformer.word_embeddings_layernorm": 0, + "model.embed_tokens": 0, + "model.norm": 0, "lm_head": 0, - "transformer.h.0": 0, - "transformer.h.1": 0, - "transformer.h.2": 0, - "transformer.h.3": 0, - "transformer.h.4": 0, - "transformer.h.5": 0, - "transformer.h.6": 0, - "transformer.h.7": 0, - "transformer.h.8": 0, - "transformer.h.9": 0, - "transformer.h.10": 1, - "transformer.h.11": 1, - "transformer.h.12": 1, - "transformer.h.13": 1, - "transformer.h.14": 1, - "transformer.h.15": 1, - "transformer.h.16": 1, - "transformer.h.17": 0, - "transformer.h.18": "cpu", - "transformer.h.19": "cpu", - 
"transformer.h.20": "cpu", - "transformer.h.21": "cpu", - "transformer.h.22": "cpu", - "transformer.h.23": 1, - "transformer.ln_f": 0, + "model.layer.0": 0, + "model.layer.1": 0, + "model.layer.2": 0, + "model.layer.3": 0, + "model.layer.4": 0, + "model.layer.5": 0, + "model.layer.6": 0, + "model.layer.7": 0, + "model.layer.8": 0, + "model.layer.9": 0, + "model.layer.10": 1, + "model.layer.11": 1, + "model.layer.12": 1, + "model.layer.13": 1, + "model.layer.14": 1, + "model.layer.15": 1, + "model.layer.16": 1, + "model.layer.17": 0, + "model.layer.18": "cpu", + "model.layer.19": "cpu", + "model.layer.20": "cpu", + "model.layer.21": "cpu", + "model.layer.22": "cpu", + "model.layer.23": 1, + "model.rotary_emb": 0, } From c18a5f1411d33ffabcb885ccd13e6e98cb46358a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 4 Dec 2024 15:03:00 +0000 Subject: [PATCH 12/42] fix requires gptq Signed-off-by: jiqing-feng --- tests/utils/test_cache_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py index 4a6dae67cbc807..053d2cf6397a17 100644 --- a/tests/utils/test_cache_utils.py +++ b/tests/utils/test_cache_utils.py @@ -21,7 +21,7 @@ from transformers import set_seed from transformers.testing_utils import ( is_torch_available, - require_auto_gptq, + require_gptq, require_non_xpu, require_read_token, require_torch, @@ -319,7 +319,7 @@ def test_hybrid_cache_n_sequences(self): self.assertListEqual(decoded, expected_text) @require_non_xpu - @require_auto_gptq + @require_gptq def test_sink_cache_hard(self): tokenizer = AutoTokenizer.from_pretrained("TheBloke/LLaMa-7B-GPTQ") model = AutoModelForCausalLM.from_pretrained("TheBloke/LLaMa-7B-GPTQ", device_map="auto") From 27ac615f3f085880c95aff4026c30f5c2d332574 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com> Date: Thu, 5 Dec 2024 20:50:33 +0800 Subject: [PATCH 13/42] Fix Transformer compat (#3) * 
revert quantizer_gptq.py change * pass **kwargs * add meta info * cleanup * cleanup * Update quantization_config.py * hf_select_quant_linear pass checkpoint_format and meta * fix GPTQTestCUDA * Update test_gptq.py * gptqmodel.hf_select_quant_linear() now does not select ExllamaV2 * cleanup * add backend * cleanup * cleanup * no need check exllama version * Update quantization_config.py * lower checkpoint_format and backend * check none * cleanup * Update quantization_config.py * fix self.use_exllama == False * spell * fix unittest * fix unittest --------- Co-authored-by: LRL Co-authored-by: Qubitium-ModelCloud --- src/transformers/utils/quantization_config.py | 45 ++++++++++++++----- tests/quantization/gptq/test_gptq.py | 43 +++++++++++++----- 2 files changed, 66 insertions(+), 22 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 9f0d55ff1a2446..18e6db6d6c02f3 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -25,7 +25,9 @@ from packaging import version -from ..utils import is_auto_awq_available, is_hqq_available, is_torch_available, is_torchao_available, logging +from .import_utils import is_auto_gptq_available +from ..utils import (is_auto_awq_available, is_hqq_available, is_torch_available, is_gptqmodel_available, + is_torchao_available, logging) if is_torch_available(): @@ -577,8 +579,14 @@ class GPTQConfig(QuantizationConfigMixin): quantization using inputs that have passed through the previously quantized layers. checkpoint_format (`str`, *optional*, defaults to `"gptq"`): GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only. + meta (`Dict[str, any]`, *optional*): + Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta. + i.e. 
`meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"] + backend (`str`, *optional*): + Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only + valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py use_cuda_fp16 (`bool`, *optional*, defaults to `False`): - Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16. + Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16. Auto-gptq only. model_seqlen (`int`, *optional*): The maximum sequence length that the model can take. block_name_to_quantize (`str`, *optional*): @@ -618,7 +626,9 @@ def __init__( desc_act: bool = False, sym: bool = True, true_sequential: bool = True, - checkpoint_format: Optional[str] = "gptq", + checkpoint_format: str = "gptq", + meta: Optional[Dict[str, any]] = None, + backend: Optional[str] = None, use_cuda_fp16: bool = False, model_seqlen: Optional[int] = None, block_name_to_quantize: Optional[str] = None, @@ -641,6 +651,9 @@ def __init__( self.desc_act = desc_act self.sym = sym self.true_sequential = true_sequential + self.checkpoint_format = checkpoint_format.lower() + self.meta = meta + self.backend = backend.lower() if isinstance(backend, str) else backend self.use_cuda_fp16 = use_cuda_fp16 self.model_seqlen = model_seqlen self.block_name_to_quantize = block_name_to_quantize @@ -653,7 +666,6 @@ def __init__( self.disable_exllama = kwargs.pop("disable_exllama", None) self.cache_block_outputs = cache_block_outputs self.modules_in_block_to_quantize = modules_in_block_to_quantize - self.checkpoint_format = checkpoint_format self.post_init() def get_loading_attributes(self): @@ -690,6 +702,17 @@ def post_init(self): ['wikitext2','c4','c4-new'], but we found {self.dataset}""" ) + # make sure backend is back/forward compatible with both gptqmodel (full) and auto-gptq 
(partial) + if is_gptqmodel_available(): + # convert auto-gptq control into gptqmodel backend + if self.backend is None: + self.backend = "auto_trainable" if self.use_exllama == False else "auto" + else: + # convert gptqmodel backend `auto_trainable` into auto-gptq control + if self.backend == "auto_trainable": + self.use_exllama = False + + # auto-gptq specific kernel control logic if self.disable_exllama is None and self.use_exllama is None: # New default behaviour self.use_exllama = True @@ -723,12 +746,13 @@ def post_init(self): "speed using exllamav2 kernel by setting `exllama_config`." ) elif self.exllama_config["version"] == ExllamaVersion.TWO: - optimum_version = version.parse(importlib.metadata.version("optimum")) - autogptq_version = version.parse(importlib.metadata.version("auto_gptq")) - if optimum_version <= version.parse("1.13.2") or autogptq_version <= version.parse("0.4.2"): - raise ValueError( - f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}" - ) + if is_auto_gptq_available(): + optimum_version = version.parse(importlib.metadata.version("optimum")) + autogptq_version = version.parse(importlib.metadata.version("auto_gptq")) + if optimum_version <= version.parse("1.13.2") or autogptq_version <= version.parse("0.4.2"): + raise ValueError( + f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}" + ) if self.modules_in_block_to_quantize is not None: optimum_version = version.parse(importlib.metadata.version("optimum")) if optimum_version < version.parse("1.15.0"): @@ -736,6 +760,7 @@ def post_init(self): "You current version of `optimum` does not support `modules_in_block_to_quantize` quantization argument, please upgrade `optimum` package to a version superior than 1.15.0 ." 
) + def to_dict(self): config_dict = super().to_dict() config_dict.pop("disable_exllama", None) diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 7dc410f2cd90ac..9f97508e243a9f 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -18,7 +18,7 @@ import pytest -from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, AutoConfig from transformers.testing_utils import ( is_torch_available, require_accelerate, @@ -84,12 +84,14 @@ class GPTQTest(unittest.TestCase): input_text = "Hello my name is" EXPECTED_OUTPUTS = set() + # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions EXPECTED_OUTPUTS.add("Hello my name is Katie, I am a 22 year") # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings EXPECTED_RELATIVE_DIFFERENCE = 2.06183008 bits = 4 + sym = True group_size = 128 desc_act = False use_exllama = False @@ -112,13 +114,15 @@ def setUpClass(cls): cls.mem_fp16 = cls.model_fp16.get_memory_footprint() cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True) + cls.config = AutoConfig.from_pretrained(cls.model_name) - quantization_config = GPTQConfig( + cls.quantization_config = GPTQConfig( bits=cls.bits, dataset=cls.dataset, tokenizer=cls.tokenizer, group_size=cls.group_size, desc_act=cls.desc_act, + sym=cls.sym, use_exllama=cls.use_exllama, ) @@ -126,7 +130,7 @@ def setUpClass(cls): cls.model_name, torch_dtype=torch.float16, device_map=cls.device_map, - quantization_config=quantization_config, + quantization_config=cls.quantization_config, ) def test_memory_footprint(self): @@ -167,14 +171,21 @@ def test_quantized_layers_class(self): """ if is_gptqmodel_available(): from gptqmodel.utils.importer import 
hf_select_quant_linear - + if hasattr(self.config, "quantization_config"): + checkpoint_format = self.config.quantization_config.get("checkpoint_format") + meta = self.config.quantization_config.get("meta") + else: + checkpoint_format = "gptq" + meta = None QuantLinear = hf_select_quant_linear( bits=self.bits, group_size=self.group_size, desc_act=self.desc_act, - sym=True, + sym=self.sym, device_map=self.device_map, - pack=False, + checkpoint_format=checkpoint_format, + meta=meta, + backend=self.quantization_config.backend, ) elif is_auto_gptq_available(): from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear @@ -187,7 +198,7 @@ def test_quantized_layers_class(self): disable_exllama=not self.use_exllama, disable_exllamav2=True, ) - self.assertTrue(self.quantized_model.model.layers[0].mlp.gate_proj.__class__ == QuantLinear) + self.assertEqual(self.quantized_model.model.layers[0].mlp.gate_proj.__class__, QuantLinear) def check_inference_correctness(self, model): r""" @@ -205,13 +216,13 @@ def check_inference_correctness(self, model): self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) def check_quantized_layers_type(self, model, value): - self.assertTrue(model.model.layers[0].mlp.gate_proj.QUANT_TYPE == value) + self.assertEqual(model.model.layers[0].mlp.gate_proj.QUANT_TYPE, value) def test_generate_quality(self): """ Simple test to check the quality of the model by comparing the generated tokens with the expected tokens """ - if self.device_map != "cpu": + if self.device_map is None: self.check_inference_correctness(self.quantized_model.to(0)) else: self.check_inference_correctness(self.quantized_model) @@ -235,7 +246,7 @@ def test_serialization(self): tmpdirname, device_map=self.device_map ) else: - quant_type = "ipex" if self.device_map == "cpu" else "cuda" + quant_type = "ipex" if self.device_map == "cpu" else "exllama" quantized_model_from_saved = 
AutoModelForCausalLM.from_pretrained( tmpdirname, device_map=self.device_map ) @@ -259,6 +270,12 @@ class GPTQTestCUDA(GPTQTest): EXPECTED_RELATIVE_DIFFERENCE = 2.06183008 device_map = {"": 0} + @classmethod + def setUpClass(cls): + super().setUpClass() + # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions + cls.EXPECTED_OUTPUTS.add("Hello my name is Katie. I am a 20 year") + def test_change_loading_attributes(self): """ Test the serialization of the model and the loading of the quantized weights works with another config file @@ -302,6 +319,7 @@ class GPTQTestActOrderExllama(unittest.TestCase): """ EXPECTED_OUTPUTS = set() + # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.") # 4bit + act_order + 128g model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ" @@ -338,7 +356,7 @@ def check_inference_correctness(self, model): self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) def test_quantized_layers_type(self): - self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE == "exllama") + self.assertEqual(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, "exllama") def test_generate_quality(self): """ @@ -377,6 +395,7 @@ class GPTQTestExllamaV2(unittest.TestCase): """ EXPECTED_OUTPUTS = set() + # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions EXPECTED_OUTPUTS.add("Hello, how are you ? 
I'm doing good, thanks for asking.") # 4bit + act_order + 128g model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ" @@ -397,7 +416,7 @@ def setUpClass(cls): cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True) def test_quantized_layers_type(self): - self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE == "exllamav2") + self.assertEqual(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, "exllama" if is_gptqmodel_available() else "exllamav2") def check_inference_correctness(self, model): """ From 3972d2e75c4a8e5d104f212aeb997c446d46b7fa Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 10 Dec 2024 10:37:49 +0000 Subject: [PATCH 14/42] fix format Signed-off-by: jiqing-feng --- src/transformers/utils/quantization_config.py | 14 ++++++++++---- tests/quantization/gptq/test_gptq.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 18e6db6d6c02f3..6ec865d2803cd2 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -25,9 +25,15 @@ from packaging import version +from ..utils import ( + is_auto_awq_available, + is_gptqmodel_available, + is_hqq_available, + is_torch_available, + is_torchao_available, + logging, +) from .import_utils import is_auto_gptq_available -from ..utils import (is_auto_awq_available, is_hqq_available, is_torch_available, is_gptqmodel_available, - is_torchao_available, logging) if is_torch_available(): @@ -583,7 +589,7 @@ class GPTQConfig(QuantizationConfigMixin): Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta. i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"] backend (`str`, *optional*): - Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. 
For auto-gptq, only + Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py use_cuda_fp16 (`bool`, *optional*, defaults to `False`): Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16. Auto-gptq only. @@ -706,7 +712,7 @@ def post_init(self): if is_gptqmodel_available(): # convert auto-gptq control into gptqmodel backend if self.backend is None: - self.backend = "auto_trainable" if self.use_exllama == False else "auto" + self.backend = "auto_trainable" if not self.use_exllama else "auto" else: # convert gptqmodel backend `auto_trainable` into auto-gptq control if self.backend == "auto_trainable": diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 9f97508e243a9f..5d7fdb1bc91107 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -18,7 +18,7 @@ import pytest -from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, AutoConfig +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GPTQConfig from transformers.testing_utils import ( is_torch_available, require_accelerate, From 99b2ed76278a01e947f05636abb08247d6a77117 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 10 Dec 2024 10:45:03 +0000 Subject: [PATCH 15/42] fix format again Signed-off-by: jiqing-feng --- src/transformers/utils/quantization_config.py | 1 - tests/quantization/gptq/test_gptq.py | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 6ec865d2803cd2..b1680a674cec6f 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -766,7 +766,6 @@ def post_init(self): 
"You current version of `optimum` does not support `modules_in_block_to_quantize` quantization argument, please upgrade `optimum` package to a version superior than 1.15.0 ." ) - def to_dict(self): config_dict = super().to_dict() config_dict.pop("disable_exllama", None) diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 5d7fdb1bc91107..846a8791ddb3c8 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -171,6 +171,7 @@ def test_quantized_layers_class(self): """ if is_gptqmodel_available(): from gptqmodel.utils.importer import hf_select_quant_linear + if hasattr(self.config, "quantization_config"): checkpoint_format = self.config.quantization_config.get("checkpoint_format") meta = self.config.quantization_config.get("meta") @@ -416,7 +417,10 @@ def setUpClass(cls): cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True) def test_quantized_layers_type(self): - self.assertEqual(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, "exllama" if is_gptqmodel_available() else "exllamav2") + self.assertEqual( + self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, + "exllama" if is_gptqmodel_available() else "exllamav2", + ) def check_inference_correctness(self, model): """ From ac14b9f41b8ca8531234adb717a98aa37808365e Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com> Date: Mon, 16 Dec 2024 18:31:01 +0800 Subject: [PATCH 16/42] update gptqmodel version (#6) * update gptqmodel version * update gptqmodel version --- docs/source/en/quantization/gptq.md | 2 +- src/transformers/quantizers/quantizer_gptq.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index dbbc95e7c1c59e..350f680456f34f 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -119,7 +119,7 @@ Only 
4-bit models are supported, and we recommend deactivating the ExLlama kerne -The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2) or GPTQModel, then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. +The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2) or GPTQModel (version > 1.4.2), then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. ```py import torch diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index a6ae314da0719e..8051461c737270 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -70,13 +70,13 @@ def validate_environment(self, *args, **kwargs): "0.4.2" ): raise ImportError( - "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel`. Please notice that auto-gptq will be deprecated in the future." + "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel>=1.4.2`. Please notice that auto-gptq will be deprecated in the future." 
) elif is_gptqmodel_available() and ( - version.parse(importlib.metadata.version("gptqmodel")) <= version.parse("1.3.1") + version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.2") or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.99") ): - raise ImportError("The gptqmodel version should be >= 1.3.2, optimum version should >= 1.24.0") + raise ImportError("The gptqmodel version should be >= 1.4.2, optimum version should >= 1.24.0") def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": if torch_dtype is None: From 0276854bfae0f4935740e620a55437f27da5be79 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com> Date: Thu, 19 Dec 2024 18:20:14 +0800 Subject: [PATCH 17/42] fix unit test (#5) * update gptqmodel version * update gptqmodel version * "not self.use_exllama" is not equivalent to "self.use_exllama==False" * fix unittest * update gptqmodel version --- src/transformers/quantizers/quantizer_gptq.py | 6 +++--- src/transformers/utils/quantization_config.py | 2 +- tests/quantization/gptq/test_gptq.py | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 8051461c737270..f96754757995fa 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -70,13 +70,13 @@ def validate_environment(self, *args, **kwargs): "0.4.2" ): raise ImportError( - "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel>=1.4.2`. Please notice that auto-gptq will be deprecated in the future." + "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel>=1.4.3`. Please notice that auto-gptq will be deprecated in the future." 
) elif is_gptqmodel_available() and ( - version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.2") + version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.3") or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.99") ): - raise ImportError("The gptqmodel version should be >= 1.4.2, optimum version should >= 1.24.0") + raise ImportError("The gptqmodel version should be >= 1.4.3, optimum version should >= 1.24.0") def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": if torch_dtype is None: diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index b1680a674cec6f..f4c7a1ad3507f4 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -712,7 +712,7 @@ def post_init(self): if is_gptqmodel_available(): # convert auto-gptq control into gptqmodel backend if self.backend is None: - self.backend = "auto_trainable" if not self.use_exllama else "auto" + self.backend = "auto_trainable" if self.use_exllama is not None and not self.use_exllama else "auto" else: # convert gptqmodel backend `auto_trainable` into auto-gptq control if self.backend == "auto_trainable": diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 846a8791ddb3c8..6f73e0feaba785 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -86,6 +86,7 @@ class GPTQTest(unittest.TestCase): EXPECTED_OUTPUTS = set() # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions EXPECTED_OUTPUTS.add("Hello my name is Katie, I am a 22 year") + EXPECTED_OUTPUTS.add("Hello my name is Katie. 
I am a 20 year") # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings EXPECTED_RELATIVE_DIFFERENCE = 2.06183008 @@ -226,6 +227,8 @@ def test_generate_quality(self): if self.device_map is None: self.check_inference_correctness(self.quantized_model.to(0)) else: + if self.device_map == "cpu" and self.quantized_model.device.type != "cpu": + self.quantized_model.to("cpu") self.check_inference_correctness(self.quantized_model) def test_serialization(self): From 4ffc7d1c7b5e7453cfdcb2258232e5a703ec4618 Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud <165116337+LRL-ModelCloud@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:57:50 +0800 Subject: [PATCH 18/42] backend is loading_attibutes (#7) --- src/transformers/utils/quantization_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 763eeec2e9d4df..6cbaa67960d798 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -676,7 +676,7 @@ def __init__( def get_loading_attributes(self): attibutes_dict = copy.deepcopy(self.__dict__) - loading_attibutes = ["disable_exllama", "use_exllama", "exllama_config", "use_cuda_fp16", "max_input_length"] + loading_attibutes = ["disable_exllama", "use_exllama", "exllama_config", "use_cuda_fp16", "max_input_length", "backend"] loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes} return loading_attibutes_dict From 5474f89864bd991b086969bac9d2fc83d02b0bff Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 20 Dec 2024 09:22:05 +0000 Subject: [PATCH 19/42] fix format and tests Signed-off-by: jiqing-feng --- src/transformers/utils/quantization_config.py | 9 ++++++++- tests/quantization/gptq/test_gptq.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git 
a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 6cbaa67960d798..c173832f320b3e 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -676,7 +676,14 @@ def __init__( def get_loading_attributes(self): attibutes_dict = copy.deepcopy(self.__dict__) - loading_attibutes = ["disable_exllama", "use_exllama", "exllama_config", "use_cuda_fp16", "max_input_length", "backend"] + loading_attibutes = [ + "disable_exllama", + "use_exllama", + "exllama_config", + "use_cuda_fp16", + "max_input_length", + "backend", + ] loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes} return loading_attibutes_dict diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 6f73e0feaba785..958b905b008e34 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -89,7 +89,7 @@ class GPTQTest(unittest.TestCase): EXPECTED_OUTPUTS.add("Hello my name is Katie. I am a 20 year") # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings - EXPECTED_RELATIVE_DIFFERENCE = 2.06183008 + EXPECTED_RELATIVE_DIFFERENCE = 2.06184043 bits = 4 sym = True From 99b5f145270f7d94a90835026441ab15b8ed65d2 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 20 Dec 2024 12:10:25 +0000 Subject: [PATCH 20/42] fix memory check Signed-off-by: jiqing-feng --- tests/quantization/gptq/test_gptq.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 958b905b008e34..20cfd12d102f8a 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -89,7 +89,7 @@ class GPTQTest(unittest.TestCase): EXPECTED_OUTPUTS.add("Hello my name is Katie. 
I am a 20 year") # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings - EXPECTED_RELATIVE_DIFFERENCE = 2.06184043 + EXPECTED_RELATIVE_DIFFERENCE = 2.0618 bits = 4 sym = True @@ -142,7 +142,7 @@ def test_memory_footprint(self): mem_quantized = self.quantized_model.get_memory_footprint() - self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE) + self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE, places=4) def test_device_and_dtype_assignment(self): r""" @@ -271,7 +271,6 @@ def test_serialization_big_model_inference(self): @require_torch_gpu class GPTQTestCUDA(GPTQTest): - EXPECTED_RELATIVE_DIFFERENCE = 2.06183008 device_map = {"": 0} @classmethod From 409f6a2b5635afd28cca3b18cb735c6085242b94 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 23 Dec 2024 10:00:17 -0500 Subject: [PATCH 21/42] fix device mismatch Signed-off-by: jiqing-feng --- src/transformers/quantizers/quantizer_gptq.py | 4 ++++ tests/quantization/gptq/test_gptq.py | 1 + 2 files changed, 5 insertions(+) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index f96754757995fa..6bff624cc65242 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -93,6 +93,10 @@ def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwarg model = self.optimum_quantizer.convert_model(model, **kwargs) def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + # Only with auto-gptq do not support CPU, we should move the model to cuda if available. 
+ if model.device.type == "cpu" and not is_gptqmodel_available() and torch.cuda.is_available(): + model = model.to(0) + model.hf_device_map = {"": 0} if self.pre_quantized: model = self.optimum_quantizer.post_init_model(model) else: diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 20cfd12d102f8a..e1c457d958e6c7 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -87,6 +87,7 @@ class GPTQTest(unittest.TestCase): # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions EXPECTED_OUTPUTS.add("Hello my name is Katie, I am a 22 year") EXPECTED_OUTPUTS.add("Hello my name is Katie. I am a 20 year") + EXPECTED_OUTPUTS.add("Hello my name is Kyle. I am a 22 year") # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings EXPECTED_RELATIVE_DIFFERENCE = 2.0618 From c996a4158cccdbc00a470c7687f0afebd596b2c1 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 23 Dec 2024 17:50:26 -0500 Subject: [PATCH 22/42] fix result check Signed-off-by: jiqing-feng --- tests/quantization/gptq/test_gptq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index e1c457d958e6c7..43bccac6cd830c 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -88,6 +88,7 @@ class GPTQTest(unittest.TestCase): EXPECTED_OUTPUTS.add("Hello my name is Katie, I am a 22 year") EXPECTED_OUTPUTS.add("Hello my name is Katie. I am a 20 year") EXPECTED_OUTPUTS.add("Hello my name is Kyle. I am a 22 year") + EXPECTED_OUTPUTS.add("Hello my name is Katie. 
I am a 22 year") # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings EXPECTED_RELATIVE_DIFFERENCE = 2.0618 From dbf68e86de25cf1c17c4d2eab48a04545027b52f Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 24 Dec 2024 09:08:08 +0800 Subject: [PATCH 23/42] Update src/transformers/quantizers/quantizer_gptq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/quantizers/quantizer_gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 6bff624cc65242..fd4c3fbceecbe0 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -53,7 +53,7 @@ def validate_environment(self, *args, **kwargs): raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)") if is_auto_gptq_available() and is_gptqmodel_available(): logger.warning( - "Detected gptqmodel and auto-gptq, will use gptqmodel, auto-gptq will be deprecated in the future." 
+ "Detected gptqmodel and auto-gptq, will use gptqmodel" ) gptq_supports_cpu = ( From f4c2ad3d8c7356cbbd71e62816edf741dd784dc5 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 24 Dec 2024 09:08:23 +0800 Subject: [PATCH 24/42] Update src/transformers/quantizers/quantizer_gptq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/quantizers/quantizer_gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index fd4c3fbceecbe0..953432686e91f1 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -70,7 +70,7 @@ def validate_environment(self, *args, **kwargs): "0.4.2" ): raise ImportError( - "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel>=1.4.3`. Please notice that auto-gptq will be deprecated in the future." + "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel>=1.4.3`." 
) elif is_gptqmodel_available() and ( version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.3") From 9185f8badf3483a72db05607c75e89dbd718d399 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 24 Dec 2024 09:08:42 +0800 Subject: [PATCH 25/42] Update src/transformers/quantizers/quantizer_gptq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/quantizers/quantizer_gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index 953432686e91f1..0f401a80f716a1 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -64,7 +64,7 @@ def validate_environment(self, *args, **kwargs): raise RuntimeError("GPU is required to quantize or run quantize model.") elif not (is_auto_gptq_available() or is_gptqmodel_available()): raise ImportError( - "Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`) library. Please notice that auto-gptq will be deprecated in the future." + "Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`) library. 
" ) elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse( "0.4.2" From 65ee44bf86712e1d32c5f1c622886d458f741052 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 24 Dec 2024 06:53:09 -0500 Subject: [PATCH 26/42] update tests Signed-off-by: jiqing-feng --- src/transformers/quantizers/quantizer_gptq.py | 16 ++-- tests/quantization/gptq/test_gptq.py | 87 +++++++++---------- 2 files changed, 51 insertions(+), 52 deletions(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index fe7e9171cd6b55..98f2fc685054cd 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -52,9 +52,7 @@ def validate_environment(self, *args, **kwargs): if not is_optimum_available(): raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)") if is_auto_gptq_available() and is_gptqmodel_available(): - logger.warning( - "Detected gptqmodel and auto-gptq, will use gptqmodel" - ) + logger.warning("Detected gptqmodel and auto-gptq, will use gptqmodel") gptq_supports_cpu = ( is_auto_gptq_available() @@ -86,6 +84,14 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": logger.info("We suggest you to set `torch_dtype=torch.float16` for better efficiency with GPTQ.") return torch_dtype + def update_device_map(self, device_map): + if device_map is None: + device_map = {"": torch.device("cpu")} + # Only with auto-gptq do not support CPU, we should move the model to cuda if available. 
+ if not is_gptqmodel_available() and device_map in ("cpu", {"": torch.device("cpu")}): + device_map == {"": 0} + return device_map + def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs): if model.__class__.main_input_name != "input_ids": raise RuntimeError("We can only quantize pure text model.") @@ -94,10 +100,6 @@ def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwarg model = self.optimum_quantizer.convert_model(model, **kwargs) def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): - # Only with auto-gptq do not support CPU, we should move the model to cuda if available. - if model.device.type == "cpu" and not is_gptqmodel_available() and torch.cuda.is_available(): - model = model.to(0) - model.hf_device_map = {"": 0} if self.pre_quantized: model = self.optimum_quantizer.post_init_model(model) else: diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 43bccac6cd830c..6aac2114dd2a98 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -79,19 +79,21 @@ def test_optimum_config(self): @require_optimum @require_gptq class GPTQTest(unittest.TestCase): - model_name = "Felladrin/Llama-160M-Chat-v1" + model_name = "bigscience/bloom-560m" input_text = "Hello my name is" EXPECTED_OUTPUTS = set() # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions - EXPECTED_OUTPUTS.add("Hello my name is Katie, I am a 22 year") - EXPECTED_OUTPUTS.add("Hello my name is Katie. I am a 20 year") - EXPECTED_OUTPUTS.add("Hello my name is Kyle. I am a 22 year") - EXPECTED_OUTPUTS.add("Hello my name is Katie. I am a 22 year") + EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. 
I") + EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") + EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of") + EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.") + EXPECTED_OUTPUTS.add("Hello my name is Alyson, I am a student in the") + EXPECTED_OUTPUTS.add("Hello my name is Alyson and I am a very sweet,") # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings - EXPECTED_RELATIVE_DIFFERENCE = 2.0618 + EXPECTED_RELATIVE_DIFFERENCE = 1.664253062 bits = 4 sym = True @@ -103,7 +105,7 @@ class GPTQTest(unittest.TestCase): "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." ] - device_map = "cpu" + device_map = "cpu" if is_gptqmodel_available() else None # called only once for all test in this class @classmethod @@ -152,7 +154,7 @@ def test_device_and_dtype_assignment(self): Checks also if other models are casted correctly. 
""" # This should work - if self.device_map == "cpu" and torch.cuda.is_available(): + if self.device_map in (None, "cpu"): _ = self.quantized_model.to(0) with self.assertRaises(ValueError): @@ -202,7 +204,7 @@ def test_quantized_layers_class(self): disable_exllama=not self.use_exllama, disable_exllamav2=True, ) - self.assertEqual(self.quantized_model.model.layers[0].mlp.gate_proj.__class__, QuantLinear) + self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) def check_inference_correctness(self, model): r""" @@ -220,7 +222,7 @@ def check_inference_correctness(self, model): self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) def check_quantized_layers_type(self, model, value): - self.assertEqual(model.model.layers[0].mlp.gate_proj.QUANT_TYPE, value) + self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE == value) def test_generate_quality(self): """ @@ -267,7 +269,8 @@ def test_serialization_big_model_inference(self): """ with tempfile.TemporaryDirectory() as tmpdirname: self.quantized_model.save_pretrained(tmpdirname) - quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map) + device_map = self.device_map or "auto" + quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=device_map) self.check_inference_correctness(quantized_model_from_saved) @@ -275,12 +278,6 @@ def test_serialization_big_model_inference(self): class GPTQTestCUDA(GPTQTest): device_map = {"": 0} - @classmethod - def setUpClass(cls): - super().setUpClass() - # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions - cls.EXPECTED_OUTPUTS.add("Hello my name is Katie. 
I am a 20 year") - def test_change_loading_attributes(self): """ Test the serialization of the model and the loading of the quantized weights works with another config file @@ -361,7 +358,7 @@ def check_inference_correctness(self, model): self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) def test_quantized_layers_type(self): - self.assertEqual(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, "exllama") + self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE == "exllama") def test_generate_quality(self): """ @@ -455,32 +452,32 @@ def test_generate_quality(self): @require_torch_multi_gpu class GPTQTestDeviceMapCPUOffload(GPTQTest): device_map = { - "model.embed_tokens": 0, - "model.norm": 0, + "transformer.word_embeddings": 0, + "transformer.word_embeddings_layernorm": 0, "lm_head": 0, - "model.layer.0": 0, - "model.layer.1": 0, - "model.layer.2": 0, - "model.layer.3": 0, - "model.layer.4": 0, - "model.layer.5": 0, - "model.layer.6": 0, - "model.layer.7": 0, - "model.layer.8": 0, - "model.layer.9": 0, - "model.layer.10": 1, - "model.layer.11": 1, - "model.layer.12": 1, - "model.layer.13": 1, - "model.layer.14": 1, - "model.layer.15": 1, - "model.layer.16": 1, - "model.layer.17": 0, - "model.layer.18": "cpu", - "model.layer.19": "cpu", - "model.layer.20": "cpu", - "model.layer.21": "cpu", - "model.layer.22": "cpu", - "model.layer.23": 1, - "model.rotary_emb": 0, + "transformer.h.0": 0, + "transformer.h.1": 0, + "transformer.h.2": 0, + "transformer.h.3": 0, + "transformer.h.4": 0, + "transformer.h.5": 0, + "transformer.h.6": 0, + "transformer.h.7": 0, + "transformer.h.8": 0, + "transformer.h.9": 0, + "transformer.h.10": 1, + "transformer.h.11": 1, + "transformer.h.12": 1, + "transformer.h.13": 1, + "transformer.h.14": 1, + "transformer.h.15": 1, + "transformer.h.16": 1, + "transformer.h.17": 0, + "transformer.h.18": "cpu", + "transformer.h.19": "cpu", + 
"transformer.h.20": "cpu", + "transformer.h.21": "cpu", + "transformer.h.22": "cpu", + "transformer.h.23": 1, + "transformer.ln_f": 0, } From b270b2d82304bea86fae44fca3c00b8c95cc6e82 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 24 Dec 2024 08:06:16 -0500 Subject: [PATCH 27/42] update tests for gptqmodel Signed-off-by: jiqing-feng --- tests/quantization/gptq/test_gptq.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 6aac2114dd2a98..c0056b23866338 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -28,7 +28,7 @@ require_torch_multi_gpu, slow, ) -from transformers.utils import is_auto_gptq_available, is_gptqmodel_available +from transformers.utils import is_auto_gptq_available, is_gptqmodel_available, is_ipex_available if is_torch_available(): @@ -91,6 +91,9 @@ class GPTQTest(unittest.TestCase): EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.") EXPECTED_OUTPUTS.add("Hello my name is Alyson, I am a student in the") EXPECTED_OUTPUTS.add("Hello my name is Alyson and I am a very sweet,") + EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University") + EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N") + EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the") # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings EXPECTED_RELATIVE_DIFFERENCE = 1.664253062 @@ -254,7 +257,10 @@ def test_serialization(self): tmpdirname, device_map=self.device_map ) else: - quant_type = "ipex" if self.device_map == "cpu" else "exllama" + if self.device_map == "cpu": + quant_type = "ipex" if is_ipex_available() else "torch" + else: + quant_type = "exllama" quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( tmpdirname, device_map=self.device_map ) From 
7120899cc229542fdcd8247e13307df85cbe8195 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com> Date: Tue, 24 Dec 2024 12:13:27 +0800 Subject: [PATCH 28/42] update document (#9) * update overview.md * cleanup * Update overview.md * Update overview.md * Update overview.md * update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md --------- Co-authored-by: Qubitium-ModelCloud --- docs/source/en/quantization/gptq.md | 42 ++++++++++++++----- docs/source/en/quantization/overview.md | 56 ++++++++++++++++--------- 2 files changed, 69 insertions(+), 29 deletions(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 350f680456f34f..fb5b365801799b 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -22,22 +22,39 @@ Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google -The [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) library implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save your memory-usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory, and you can also expect a speedup in inference because using a lower bitwidth takes less time to communicate. +Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) libraries implement the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes error.
These weights are quantized to int4, stored as int32 (int4 x 8) and dequantized (restored) to fp16 on the fly during inference. This can save memory-usage by almost 4x because the int4 weights are often dequantized in a fused kernel. One can also expect a substantial speedup in inference due to lower bandwidth requirements for lower bitwidth. -Now, we are going to replace [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) with [GPTQModel](https://github.com/ModelCloud/GPTQModel), the auto_gptq will be deprecated in the future. +[GPTQModel](https://github.com/ModelCloud/GPTQModel) has its origin as a maintained fork of AutoGPTQ but has since differentiated itself with the following major differences: -Before you begin, make sure the following libraries are installed: +* Model support: GPTQModel continues to support all of the latest released LLM models. +* Multi-Modal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models. +* Platform support: Validated MacOS Apple Silicon and Windows 11 support. +* Hardware support: Apple Silicon M1+, Intel/AMD CPU, and Intel Datacenter Max + Arc GPUs. +* IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max + Arc) support. +* Updated Marlin kernel from Neural Magic that is highly optimized for A100 +* Updated Kernels with auto-padding for legacy model support and models with non-uniform in/out-features. +* Faster quantization, lower memory usage, and more accurate default quantization via GPTQModel quantization apis. +* User and developer friendly apis. + + +[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) will likely be deprecated in the future due to the lack of continued support for new models and features. + +Before you begin, make sure the following libraries are installed and updated to the latest release: ```bash -pip install auto-gptq +pip install --upgrade accelerate optimum transformers ``` -or + +Then install either GPTQModel or AutoGPTQ.
+ ```bash -pip install gptqmodel +pip install gptqmodel --no-build-isolation ``` +or + ```bash -pip install --upgrade accelerate optimum transformers +pip install auto-gptq --no-build-isolation ``` To quantize a model (currently only supported for text models), you need to create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calibrate the weights for quantization, and a tokenizer to prepare the dataset. @@ -101,9 +118,14 @@ from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto") ``` +## Marlin + +[Marlin](https://github.com/IST-DASLab/marlin) is a CUDA gptq kernel, 4-bit only, that is highly optimized for the Nvidia A100 GPU (Ampere) architecture where the loading, dequantization, and execution of post-dequantized weights are highly parallelized offering a substantial inference improvement versus the original CUDA gptq kernel. Marlin is only available for quantized inference and does not support model quantization. + + ## ExLlama -[ExLlama](https://github.com/turboderp/exllama) is a Python/C++/CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter: +[ExLlama](https://github.com/turboderp/exllama) is a CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object.
To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter: ```py import torch @@ -119,11 +141,11 @@ Only 4-bit models are supported, and we recommend deactivating the ExLlama kerne -The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2) or GPTQModel (version > 1.4.2), then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. +The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ or GPTQModel, then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. ```py import torch from transformers import AutoModelForCausalLM, GPTQConfig gptq_config = GPTQConfig(bits=4, use_exllama=False) model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config) -``` \ No newline at end of file +``` diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index b85cb8542a363c..2c9b2babb0785f 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -45,38 +45,56 @@ In short, supporting a wide range of quantization methods allows you to pick the Use the table below to help you decide which quantization method to use. 
-| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with πŸ€— transformers | πŸ€— transformers support | Link to library | -|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| -| [AQLM](./aqlm) | πŸ”΄ | 🟒 | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | 🟒 | 1 / 2 | 🟒 | 🟒 | 🟒 | https://github.com/Vahe1994/AQLM | -| [AWQ](./awq) | πŸ”΄ | 🟒 | 🟒 | 🟒 | πŸ”΄ | 🟒 | ? | 4 | 🟒 | 🟒 | 🟒 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes) | 🟒 | 🟑 * | 🟒 | 🟑 * | πŸ”΄ ** | 🟑 * | πŸ”΄ (soon!) | 4 / 8 | 🟒 | 🟒 | 🟒 | https://github.com/bitsandbytes-foundation/bitsandbytes | -| [compressed-tensors](./compressed_tensors) | πŸ”΄ | 🟒 | 🟒 | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | 1 - 8 | 🟒 | 🟒 | 🟒 | https://github.com/neuralmagic/compressed-tensors | -| [EETQ](./eetq) | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | ? 
| 8 | 🟒 | 🟒 | 🟒 | https://github.com/NetEase-FuXi/EETQ | -| GGUF / GGML (llama.cpp) | 🟒 | 🟒 | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | πŸ”΄ | 1 - 8 | πŸ”΄ | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | -| [GPTQ](./gptq) | πŸ”΄ | 🟑 *** | 🟒 | 🟒 | πŸ”΄ | 🟑 *** | πŸ”΄ | 2 - 3 - 4 - 8 | 🟒 | 🟒 | 🟒 | https://github.com/AutoGPTQ/AutoGPTQ | -| [HIGGS](./higgs) | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | 🟒 | 2 - 4 | πŸ”΄ | 🟒 | 🟒 | https://github.com/HanGuo97/flute | -| [HQQ](./hqq) | 🟒 | 🟒 | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | 🟒 | 1 - 8 | 🟒 | πŸ”΄ | 🟒 | https://github.com/mobiusml/hqq/ | -| [optimum-quanto](./quanto) | 🟒 | 🟒 | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | 🟒 | 2 / 4 / 8 | πŸ”΄ | πŸ”΄ | 🟒 | https://github.com/huggingface/optimum-quanto | -| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | πŸ”΄ | 8 | πŸ”΄ | 🟒 | 🟒 | https://github.com/pytorch/FBGEMM | -| [torchao](./torchao.md) | 🟒 | | 🟒 | πŸ”΄ | partial support (int4 weight only) | πŸ”΄ | | 4 / 8 | | πŸŸ’πŸ”΄ | 🟒 | https://github.com/pytorch/ao | -| [VPTQ](./vptq) | πŸ”΄ | πŸ”΄ | 🟒 | 🟑 | πŸ”΄ | πŸ”΄ | 🟒 | 1 - 8 | πŸ”΄ | 🟒 | 🟒 | https://github.com/microsoft/VPTQ | +| Quantization method | On the fly quantization | CPU | CUDA GPU | ROCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() | Number of bits | Supports fine-tuning (through PEFT) | Serializable with πŸ€— transformers | πŸ€— transformers support | Link to library | +|--------------------------------------------|-------------------------|-----------------|----------|-----------------|------------------------------------|-----------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| +| [AQLM](./aqlm.md) | πŸ”΄ | 🟒 | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | 🟒 | 1 / 2 | 🟒 | 🟒 | 🟒 | https://github.com/Vahe1994/AQLM | +| [AWQ](./awq.md) | πŸ”΄ | 🟒 | 🟒 | 🟒 | πŸ”΄ | 🟒 | ? 
| 4 | 🟒 | 🟒 | 🟒 | https://github.com/casper-hansen/AutoAWQ | +| [bitsandbytes](./bitsandbytes.md) | 🟒 | 🟑 1 | 🟒 | 🟑 1 | πŸ”΄ 2 | 🟑 1 | πŸ”΄ 1 | 4 / 8 | 🟒 | 🟒 | 🟒 | https://github.com/bitsandbytes-foundation/bitsandbytes | +| [compressed-tensors](./compressed_tensors.md) | πŸ”΄ | 🟒 | 🟒 | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | 1 / 8 | 🟒 | 🟒 | 🟒 | https://github.com/neuralmagic/compressed-tensors | +| [EETQ](./eetq.md) | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | ? | 8 | 🟒 | 🟒 | 🟒 | https://github.com/NetEase-FuXi/EETQ | +| [GGUF / GGML (llama.cpp)](../gguf.md) | 🟒 | 🟒 | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | πŸ”΄ | 1 / 8 | πŸ”΄ | πŸ”΄ 6 | πŸ”΄ 6 | https://github.com/ggerganov/llama.cpp | +| [GPTQModel](./gptq.md) | πŸ”΄ | 🟒 3 | 🟒 | 🟒 | 🟒 | 🟒 4 | πŸ”΄ | 2 / 3 / 4 / 8 | 🟒 | 🟒 | 🟒 | https://github.com/ModelCloud/GPTQModel | +| [AutoGPTQ](./gptq.md) | πŸ”΄ | πŸ”΄ | 🟒 | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | 2 / 3 / 4 / 8 | 🟒 | 🟒 | 🟒 | https://github.com/AutoGPTQ/AutoGPTQ | +| [HIGGS](./higgs.md) | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | 🟒 | 2 / 4 | πŸ”΄ | 🟒 | 🟒 | https://github.com/HanGuo97/flute | +| [HQQ](./hqq.md) | 🟒 | 🟒 | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | 🟒 | 1 / 8 | 🟒 | πŸ”΄ | 🟒 | https://github.com/mobiusml/hqq/ | +| [optimum-quanto](./quanto.md) | 🟒 | 🟒 | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | 🟒 | 2 / 4 / 8 | πŸ”΄ | πŸ”΄ | 🟒 | https://github.com/huggingface/optimum-quanto | +| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟒 | πŸ”΄ | 🟒 | πŸ”΄ | πŸ”΄ | πŸ”΄ | πŸ”΄ | 8 | πŸ”΄ | 🟒 | 🟒 | https://github.com/pytorch/FBGEMM | +| [torchao](./torchao.md) | 🟒 | | 🟒 | πŸ”΄ | 🟑 5 | πŸ”΄ | | 4 / 8 | | πŸŸ’πŸ”΄ | 🟒 | https://github.com/pytorch/ao | +| [VPTQ](./vptq.md) | πŸ”΄ | πŸ”΄ | 🟒 | 🟑 | πŸ”΄ | πŸ”΄ | 🟒 | 1 / 8 | πŸ”΄ | 🟒 | 🟒 | https://github.com/microsoft/VPTQ | + +**1** bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. 
For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. -\* bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + + + + +**2** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. + + -We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. + + +**3** GPTQModel[CPU] supports full bit range via Torch and 4-bit via IPEX on Intel/AMD. -\** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. +**4** GPTQModel[Intel GPU] via IPEX only supports 4-bit for Intel Datacenter Max + Arc. -\*** GPTQ only supports 4-bit on Intel CPU / GPU. +**5** torchao only supports int4 weight on Metal (Apple Silicon). 
+ + + + +**6** [See GGUF section](../gguf.md) + + From 34d0ec0623e6258dcf6dac85bc1864d334419af5 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 20:09:55 +0800 Subject: [PATCH 29/42] review: update docs (#10) --- docs/source/en/quantization/overview.md | 48 +++++++++++-------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 2c9b2babb0785f..08a4d719ece889 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -45,56 +45,50 @@ In short, supporting a wide range of quantization methods allows you to pick the Use the table below to help you decide which quantization method to use. -| Quantization method | On the fly quantization | CPU | CUDA GPU | ROCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library | -|--------------------------------------------|-------------------------|-----------------|----------|-----------------|------------------------------------|-----------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| -| [AQLM](./aqlm.md) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | -| [AWQ](./awq.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ?
| 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes.md) | 🟢 | 🟡 1 | 🟢 | 🟡 1 | 🔴 2 | 🟡 1 | 🔴 1 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | -| [compressed-tensors](./compressed_tensors.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | -| [EETQ](./eetq.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | -| [GGUF / GGML (llama.cpp)](../gguf.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 / 8 | 🔴 | 🔴 6 | 🔴 6 | https://github.com/ggerganov/llama.cpp | -| [GPTQModel](./gptq.md) | 🔴 | 🟢 3 | 🟢 | 🟢 | 🟢 | 🟢 4 | 🔴 | 2 / 3 / 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel | -| [AutoGPTQ](./gptq.md) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 / 3 / 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | -| [HIGGS](./higgs.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2 / 4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | -| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | -| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | -| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | -| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 5 | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | -| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1 / 8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | +| Quantization Method | Runtime Quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library |
+|-----------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|---------------|------------------|-----------------------------|-------------------------|---------------------------------------------| +| [AQLM](./aqlm.md) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | +| [AWQ](./awq.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | +| [bitsandbytes](./bitsandbytes.md) | 🟢 | 🟡 1 | 🟢 | 🟡 1 | 🔴 2 | 🟡 1 | 🔴 1 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | +| [compressed-tensors](./compressed_tensors.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | +| [EETQ](./eetq.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | +| [GGUF / GGML (llama.cpp)](../gguf.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 / 8 | 🔴 | [See Notes](../gguf.md) | [See Notes](../gguf.md) | https://github.com/ggerganov/llama.cpp | +| [GPTQModel](./gptq.md) | 🔴 | 🟢 3 | 🟢 | 🟢 | 🟢 | 🟢 4 | 🔴 | 2 / 3 / 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel | +| [AutoGPTQ](./gptq.md) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 / 3 / 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | +| [HIGGS](./higgs.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2 / 4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | +| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | +| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | +| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | +|
[torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 5 | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | +| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1 / 8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | -**1** bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. +**1:** bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. -**2** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. +**2:** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. -**3** GPTQModel[CPU] supports full bit range via Torch and 4-bit via IPEX on Intel/AMD. +**3:** GPTQModel[CPU] supports 4-bit via IPEX on Intel/AMD and full bit range via Torch on Intel/Amd/Apple Silicon. -**4** GPTQModel[Intel GPU] via IPEX only supports 4-bit for Intel Datacenter Max + Arc.
+**4:** GPTQModel[Intel GPU] via IPEX only supports 4-bit for Intel Datacenter Max + Arc. -**5** torchao only supports int4 weight on Metal (Apple Silicon). +**5:** torchao only supports int4 weight on Metal (Apple Silicon). - - - -**6** [See GGUF section](../gguf.md) - - From 153121aee06ab5783f2409cabe33ec553e03be2d Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 21:05:24 +0800 Subject: [PATCH 30/42] review: update docs (#12) * review: update docs * fix typo --- docs/source/en/quantization/gptq.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index fb5b365801799b..fef2265dd9e104 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -122,6 +122,14 @@ model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", de [Marlin](https://github.com/IST-DASLab/marlin) is a CUDA gptq kernel, 4-bit only, that is highly optimized for the Nvidia A100 GPU (Ampere) architecture where the the loading, dequantization, and execution of post-dequantized weights are highly parallelized offering a substantial inference improvement versus the original CUDA gptq kernel. Marlin is only available for quantized inference and does support model quantization. 
+Marlin inference can be activated via the `backend` property in `GPTQConfig` for GPTQModel: + +```py + +from transformers import AutoModelForCausalLM, GPTQConfig + +model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=GPTQConfig(bits=4, backend="marlin")) +``` ## ExLlama From 8e36a0e9b4eb71819d0f40cc255bb60b6454e5e8 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 21:25:29 +0800 Subject: [PATCH 31/42] typo --- docs/source/en/quantization/gptq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index fef2265dd9e104..4fc12f6d1096b8 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -120,7 +120,7 @@ model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", de ## Marlin -[Marlin](https://github.com/IST-DASLab/marlin) is a CUDA gptq kernel, 4-bit only, that is highly optimized for the Nvidia A100 GPU (Ampere) architecture where the the loading, dequantization, and execution of post-dequantized weights are highly parallelized offering a substantial inference improvement versus the original CUDA gptq kernel. Marlin is only available for quantized inference and does support model quantization. +[Marlin](https://github.com/IST-DASLab/marlin) is a CUDA gptq kernel, 4-bit only, that is highly optimized for the Nvidia A100 GPU (Ampere) architecture where the the loading, dequantization, and execution of post-dequantized weights are highly parallelized offering a substantial inference improvement versus the original CUDA gptq kernel. Marlin is only available for quantized inference and does not support model quantization. 
Marlin inference can be activated via the `backend` property in `GPTQConfig` for GPTQModel: From 0aef2df74a3a7676a1878d97146eedb1ff3875f2 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 23:35:09 +0800 Subject: [PATCH 32/42] doc note for asymmetric quant --- docs/source/en/quantization/gptq.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 4fc12f6d1096b8..01da44177fe275 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -29,7 +29,8 @@ Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https:/ * Model support: GPTQModel continues to support all of the latest released LLM models. * Multi-Modal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models. * Platform support: Validated MacOS Apple Silicone and Windows 11 support. -* Hardware support: Apple silicone M1+, Intel/AMD CPU, and Intel Datacetner Max + Arc GPUs. +* Hardware support: Apple silicone M1+, Intel/AMD CPU, and Intel Datacenter Max + Arc GPUs. +* Asymmetric support: Asymmetric quantization can potentially introduce lower quantization errors compared to symmetric quantization. However, it is not backward compatible with AutoGPTQ, and not all kernels, such as Marlin, support asymmetric quantization. * IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max + ARc) support. * Updated Marlin kernel from Neural Magic that is higly optimized for A100 * Updated Kernels with auto-padding for legacy model support and models with non-uniform in/out-features. 
From 31a6baaa2ce00e5fa4d86024f9060042d03693b5 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 23:58:40 +0800 Subject: [PATCH 33/42] typo with apple silicon(e) --- docs/source/en/quantization/gptq.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 01da44177fe275..79fcdd707c2ab4 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -28,8 +28,8 @@ Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https:/ * Model support: GPTQModel continues to support all of the latest released LLM models. * Multi-Modal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models. -* Platform support: Validated MacOS Apple Silicone and Windows 11 support. -* Hardware support: Apple silicone M1+, Intel/AMD CPU, and Intel Datacenter Max + Arc GPUs. +* Platform support: Validated MacOS Apple Silicon and Windows 11 support. +* Hardware support: Apple Silicon M1+, Intel/AMD CPU, and Intel Datacenter Max + Arc GPUs. * Asymmetric support: Asymmetric quantization can potentially introduce lower quantization errors compared to symmetric quantization. However, it is not backward compatible with AutoGPTQ, and not all kernels, such as Marlin, support asymmetric quantization. * IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max + ARc) support. 
* Updated Marlin kernel from Neural Magic that is higly optimized for A100 From d7c889020444f0fb3c036cd4f7eaeb039cbc502b Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Wed, 25 Dec 2024 00:03:04 +0800 Subject: [PATCH 34/42] typo for marlin --- docs/source/en/quantization/gptq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 79fcdd707c2ab4..6565eb2410fce3 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -32,7 +32,7 @@ Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https:/ * Hardware support: Apple Silicon M1+, Intel/AMD CPU, and Intel Datacenter Max + Arc GPUs. * Asymmetric support: Asymmetric quantization can potentially introduce lower quantization errors compared to symmetric quantization. However, it is not backward compatible with AutoGPTQ, and not all kernels, such as Marlin, support asymmetric quantization. * IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max + ARc) support. -* Updated Marlin kernel from Neural Magic that is higly optimized for A100 +* Updated Marlin kernel from Neural Magic that is optimized for A100 (Ampere) * Updated Kernels with auto-padding for legacy model support and models with non-uniform in/out-features. * Faster quantization, lower memory usage, and more accurate default quantization via GPTQModel quantization apis. * User and developer friendly apis. 
From 945f6633dea723613e4961c20c69b2a1f92f1b4f Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Thu, 26 Dec 2024 23:34:22 +0800 Subject: [PATCH 35/42] column name revert: review --- docs/source/en/quantization/overview.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 08a4d719ece889..b09c7b55148b71 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -45,22 +45,22 @@ In short, supporting a wide range of quantization methods allows you to pick the Use the table below to help you decide which quantization method to use. -| Quantization Method | Runtime Quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library | +| Quantization Method | On the fly quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library | |-----------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|---------------|------------------|-----------------------------|-------------------------|---------------------------------------------| -| [AQLM](./aqlm.md) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | +| [AQLM](./aqlm.md) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | | [AWQ](./awq.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ?
| 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes.md) | 🟢 | 🟡 1 | 🟢 | 🟡 1 | 🔴 2 | 🟡 1 | 🔴 1 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | -| [compressed-tensors](./compressed_tensors.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | +| [bitsandbytes](./bitsandbytes.md) | 🟢 | 🟡 1 | 🟢 | 🟡 1 | 🔴 2 | 🟡 1 | 🔴 1 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | +| [compressed-tensors](./compressed_tensors.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1/8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | | [EETQ](./eetq.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | -| [GGUF / GGML (llama.cpp)](../gguf.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 / 8 | 🔴 | [See Notes](../gguf.md) | [See Notes](../gguf.md) | https://github.com/ggerganov/llama.cpp | -| [GPTQModel](./gptq.md) | 🔴 | 🟢 3 | 🟢 | 🟢 | 🟢 | 🟢 4 | 🔴 | 2 / 3 / 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel | -| [AutoGPTQ](./gptq.md) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 / 3 / 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | -| [HIGGS](./higgs.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2 / 4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | -| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | -| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | +| [GGUF / GGML (llama.cpp)](../gguf.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf.md) | [See Notes](../gguf.md) | https://github.com/ggerganov/llama.cpp | +| [GPTQModel](./gptq.md) | 🔴 | 🟢 3 | 🟢 | 🟢 | 🟢 | 🟢 4 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel | +|
[AutoGPTQ](./gptq.md) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | +| [HIGGS](./higgs.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | +| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | +| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | | [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | -| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 5 | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | -| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1 / 8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | +| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 5 | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | +| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | From 91d12ccf19b42f6d5282133ed3e8297ef05b45b1 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Thu, 9 Jan 2025 19:22:52 +0800 Subject: [PATCH 36/42] doc rocm support --- docs/source/en/quantization/gptq.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 6565eb2410fce3..eebf0536affe36 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -28,8 +28,8 @@ Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https:/ * Model support: GPTQModel continues to support all of the latest released LLM models. * Multi-Modal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models. -* Platform support: Validated MacOS Apple Silicon and Windows 11 support.
-* Hardware support: Apple Silicon M1+, Intel/AMD CPU, and Intel Datacenter Max + Arc GPUs. +* Platform support: Linux, MacOS (Apple Silicon), and Windows 11. +* Hardware support: Nvidia CUDA, AMD ROCm, Apple Silicon M1+ MPS + CPU, Intel/AMD CPU, and Intel Datacenter Max + Arc GPUs. * Asymmetric support: Asymmetric quantization can potentially introduce lower quantization errors compared to symmetric quantization. However, it is not backward compatible with AutoGPTQ, and not all kernels, such as Marlin, support asymmetric quantization. * IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max + ARc) support. * Updated Marlin kernel from Neural Magic that is optimized for A100 (Ampere) From 1ec6fe7646c4ca52d26864044e3392820466f20c Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 10 Jan 2025 09:45:19 +0800 Subject: [PATCH 37/42] Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/gptq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index eebf0536affe36..7f75898145d843 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -22,7 +22,7 @@ Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google -Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) libraries implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes error. These weights are quantized to int4, stored as int32 (int4 x 8) and dequantized (restored) to fp16 on the fly during inference. This can save memory-usage by almost 4x because the int4 weights are often dequantized in a fused kernel. 
One can also expect a substantial speedup in inference due to lower bandwidth requirements for lower bitwidth. +Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) libraries implement the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes error. These weights are quantized to int4, stored as int32 (int4 x 8) and dequantized (restored) to fp16 on the fly during inference. This can save memory by almost 4x because the int4 weights are often dequantized in a fused kernel. You can also expect a substantial speedup in inference due to lower bandwidth requirements for lower bitwidth. [GPTQModel](https://github.com/ModelCloud/GPTQModel) has its origin as a maintained fork of AutoGPTQ but has since differentiated itself with the following major differences: From 7d2b7085d60023bb6cccdad3d9ef8f9ac88612ae Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 10 Jan 2025 09:45:38 +0800 Subject: [PATCH 38/42] Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/gptq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 7f75898145d843..84dddbe53d0d3a 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -24,7 +24,7 @@ Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) libraries implement the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes error. 
These weights are quantized to int4, stored as int32 (int4 x 8) and dequantized (restored) to fp16 on the fly during inference. This can save memory by almost 4x because the int4 weights are often dequantized in a fused kernel. You can also expect a substantial speedup in inference due to lower bandwidth requirements for lower bitwidth. -[GPTQModel](https://github.com/ModelCloud/GPTQModel) has its origin as a maintained fork of AutoGPTQ but has since differentiated itself with the following major differences: +[GPTQModel](https://github.com/ModelCloud/GPTQModel) started as a maintained fork of AutoGPTQ but has since differentiated itself with the following major differences. * Model support: GPTQModel continues to support all of the latest released LLM models. * Multi-Modal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models. From 8c2a8b3835e2a09a96977b213bdbcd01f92b986f Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 10 Jan 2025 09:46:05 +0800 Subject: [PATCH 39/42] Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/gptq.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 84dddbe53d0d3a..b510507c89bf57 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -26,16 +26,16 @@ Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https:/ [GPTQModel](https://github.com/ModelCloud/GPTQModel) started as a maintained fork of AutoGPTQ but has since differentiated itself with the following major differences. -* Model support: GPTQModel continues to support all of the latest released LLM models. -* Multi-Modal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models. 
-* Platform support: Linux, MacOS (Apple Silicon), and Windows 11. -* Hardware support: Nvidia CUDA, AMD ROCm, Apple Silicon M1+ MPS + CPU, Intel/AMD CPU, and Intel Datacenter Max + Arc GPUs. +* Model support: GPTQModel continues to support all of the latest LLM models. +* Multimodal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models. +* Platform support: Linux, macOS (Apple Silicon), and Windows 11. +* Hardware support: NVIDIA CUDA, AMD ROCm, Apple Silicon M1/MPS/CPU, Intel/AMD CPU, and Intel Datacenter Max/Arc GPUs. * Asymmetric support: Asymmetric quantization can potentially introduce lower quantization errors compared to symmetric quantization. However, it is not backward compatible with AutoGPTQ, and not all kernels, such as Marlin, support asymmetric quantization. -* IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max + ARc) support. -* Updated Marlin kernel from Neural Magic that is optimized for A100 (Ampere) -* Updated Kernels with auto-padding for legacy model support and models with non-uniform in/out-features. -* Faster quantization, lower memory usage, and more accurate default quantization via GPTQModel quantization apis. -* User and developer friendly apis. +* IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max/Arc GPUs) support. +* Updated Marlin kernel from Neural Magic optimized for A100 (Ampere). +* Updated kernels with auto-padding for legacy model support and models with non-uniform in/out-features. +* Faster quantization, lower memory usage, and more accurate default quantization via GPTQModel quantization APIs. +* User and developer friendly APIs. [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) will likely be deprecated in the future due to the lack of continued support for new models and features.
From 053e0adc4f435d7ff9590f7df91ae1ec71564013 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 10 Jan 2025 09:46:45 +0800 Subject: [PATCH 40/42] Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/gptq.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index b510507c89bf57..1534a977f3436f 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -121,9 +121,9 @@ model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", de ## Marlin -[Marlin](https://github.com/IST-DASLab/marlin) is a CUDA gptq kernel, 4-bit only, that is highly optimized for the Nvidia A100 GPU (Ampere) architecture where the the loading, dequantization, and execution of post-dequantized weights are highly parallelized offering a substantial inference improvement versus the original CUDA gptq kernel. Marlin is only available for quantized inference and does not support model quantization. +[Marlin](https://github.com/IST-DASLab/marlin) is a 4-bit only CUDA GPTQ kernel, highly optimized for the NVIDIA A100 GPU (Ampere) architecture. Loading, dequantization, and execution of post-dequantized weights are highly parallelized, offering a substantial inference improvement versus the original CUDA GPTQ kernel. Marlin is only available for quantized inference and does not support model quantization. -Marlin inference can be activated via the `backend` property in `GPTQConfig` for GPTQModel: +Marlin inference can be activated with the `backend` parameter in [`GPTQConfig`]. 
```py From d3bfbb00c54ddc487c3febb37f18e5fe2a36b810 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 10 Jan 2025 09:46:55 +0800 Subject: [PATCH 41/42] Update docs/source/en/quantization/overview.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index b09c7b55148b71..aaf424a121d11f 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -76,7 +76,7 @@ Use the table below to help you decide which quantization method to use. -**3:** GPTQModel[CPU] supports 4-bit via IPEX on Intel/AMD and full bit range via Torch on Intel/Amd/Apple Silicon. +**3:** GPTQModel[CPU] supports 4-bit via IPEX on Intel/AMD and full bit range via Torch on Intel/AMD/Apple Silicon. From 1d883ec056bc40b19742e54b419b84ad1dbcaac9 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 10 Jan 2025 09:47:05 +0800 Subject: [PATCH 42/42] Update docs/source/en/quantization/overview.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index aaf424a121d11f..dfe680832b1952 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -82,7 +82,7 @@ Use the table below to help you decide which quantization method to use. -**4:** GPTQModel[Intel GPU] via IPEX only supports 4-bit for Intel Datacenter Max + Arc. +**4:** GPTQModel[Intel GPU] via IPEX only supports 4-bit for Intel Datacenter Max/Arc GPUs.