fix format and convert v2 to v1
Signed-off-by: jiqing-feng <[email protected]>
jiqing-feng committed Dec 16, 2024
1 parent 5979473 commit 3603a0b
Showing 3 changed files with 17 additions and 12 deletions.
22 changes: 13 additions & 9 deletions optimum/gptq/quantizer.py
@@ -29,10 +29,10 @@

from ..utils import is_accelerate_available, is_auto_gptq_available, is_gptqmodel_available
from ..utils.modeling_utils import recurse_getattr
from ..version import __version__ as optimum_version
from .constants import GPTQ_CONFIG
from .data import get_dataset, prepare_dataset
from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
from ..version import __version__ as optimum_version


if is_accelerate_available():
@@ -43,11 +43,11 @@
from accelerate.hooks import remove_hook_from_module

if is_auto_gptq_available():
from auto_gptq import __version__ as autogptq_version
from auto_gptq import exllama_set_max_input_length
from auto_gptq.modeling._utils import autogptq_post_init as gptq_post_init
from auto_gptq.quantization import GPTQ
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear
from auto_gptq import __version__ as autogptq_version

if is_gptqmodel_available():
from gptqmodel import exllama_set_max_input_length
@@ -128,8 +128,7 @@ def __init__(
Properties, such as tooling:version, that do not directly contribute to quantization or quant inference are stored in meta.
i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
backend (`str`, *optional*):
Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only
valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
use_cuda_fp16 (`bool`, defaults to `False`):
Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
model_seqlen (`Optional[int]`, defaults to `None`):
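
A minimal sketch of how the documented `backend` knob might be passed when constructing the quantizer. The keyword arguments mirror the docstring above; the concrete values (and the choice of calibration dataset) are illustrative assumptions, not code from this commit.

```python
# Hedged sketch: constructing the quantizer with the documented `backend` option.
from optimum.gptq import GPTQQuantizer

quantizer = GPTQQuantizer(
    bits=4,            # quantization bit-width
    dataset="c4",      # calibration dataset identifier (illustrative)
    backend="auto",    # gptqmodel: "auto", "auto_trainable", ...; auto-gptq: None or "auto_trainable"
    model_seqlen=2048, # sequence length used for calibration
)
```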
@@ -246,7 +245,7 @@ def to_dict(self):

if gptq_dict.get("meta") is None:
gptq_dict["meta"] = {}

meta = gptq_dict["meta"]
# store both optimum:version and gptq_lib:version into quantize_config.meta.quantizer
if meta.get("quantizer") is None:
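
The hunk above records tooling versions under `meta.quantizer`. A self-contained sketch of that idea follows; the helper name, the gptqmodel version import, and the exact version strings are illustrative assumptions rather than the file's actual continuation.

```python
from optimum.utils import is_auto_gptq_available, is_gptqmodel_available
from optimum.version import __version__ as optimum_version


def record_quantizer_versions(gptq_dict: dict) -> dict:
    """Hypothetical helper: store ["optimum:<version>", "<gptq lib>:<version>"]
    under meta.quantizer, as described in the docstring above."""
    meta = gptq_dict.setdefault("meta", {})
    if meta.get("quantizer") is None:
        versions = [f"optimum:{optimum_version}"]
        if is_gptqmodel_available():
            import gptqmodel  # assumed to expose __version__

            versions.append(f"gptqmodel:{gptqmodel.__version__}")
        elif is_auto_gptq_available():
            from auto_gptq import __version__ as autogptq_version

            versions.append(f"auto_gptq:{autogptq_version}")
        meta["quantizer"] = versions
    return gptq_dict
```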
@@ -719,7 +718,9 @@ class StoreAttr(object):
pass

if is_gptqmodel_available():
model, _ = hf_convert_gptq_v1_to_v2_format(model, self.bits, self.quant_linear, self.checkpoint_format, self.meta)
model, _ = hf_convert_gptq_v1_to_v2_format(
model, self.bits, self.quant_linear, self.checkpoint_format, self.meta
)

model.quantize_config = StoreAttr()
model.quantize_config.desc_act = self.desc_act
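
The `StoreAttr` shim above appears to exist only so that downstream post-init code can read `model.quantize_config.desc_act`. An equivalent stand-alone sketch using the standard library (an alternative illustration, not what the file uses):

```python
from types import SimpleNamespace


def attach_quantize_config(model, desc_act: bool):
    """Attach a minimal attribute container; the post-init step only needs `.desc_act`."""
    model.quantize_config = SimpleNamespace(desc_act=desc_act)
    return model
```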
@@ -790,9 +791,12 @@ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", sa
"""

# convert gptqmodel internal gptq_v2 format to v1 for max compatibility
model, converted = hf_convert_gptq_v2_to_v1_format(model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta)
if converted:
self.checkpoint_format = "gptq"
if is_gptqmodel_available():
model, converted = hf_convert_gptq_v2_to_v1_format(
model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta
)
if converted:
self.checkpoint_format = "gptq"

os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
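
Together with the load-time hunk above, the save path now converts only when gptqmodel is installed: in-memory weights use gptqmodel's internal gptq_v2 format and are converted back to the v1 ("gptq") checkpoint format at save time for maximum compatibility. A hedged end-to-end usage sketch follows; the model id and quantization settings are illustrative, not taken from this commit.

```python
# Hedged usage sketch: quantize and save; with gptqmodel installed, save()
# converts the internal v2 format back to a v1 ("gptq") checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.gptq import GPTQQuantizer

model_id = "facebook/opt-125m"  # illustrative model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

quantizer = GPTQQuantizer(bits=4, dataset="c4", model_seqlen=2048)
quantized_model = quantizer.quantize_model(model, tokenizer)
quantizer.save(quantized_model, "opt125m-gptq")  # checkpoint_format ends up as "gptq" if a conversion happened
```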
2 changes: 1 addition & 1 deletion optimum/utils/import_utils.py
@@ -52,7 +52,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
TRANSFORMERS_MINIMUM_VERSION = version.parse("4.25.0")
DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0")
AUTOGPTQ_MINIMUM_VERSION = version.parse("0.4.99") # Allows 0.5.0.dev0
GPTQMODEL_MINIMUM_VERSION = version.parse("1.3.99") # Allows 1.4.0.dev0
GPTQMODEL_MINIMUM_VERSION = version.parse("1.4.1") # Allows 1.4.0.dev0


# This is the minimal required version to support some ONNX Runtime features
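
For reference, a sketch of how such a minimum-version constant is typically consumed; the helper below is illustrative and is not the actual `_is_package_available` / `is_gptqmodel_available` logic.

```python
import importlib.metadata

from packaging import version

GPTQMODEL_MINIMUM_VERSION = version.parse("1.4.1")


def gptqmodel_meets_minimum() -> bool:
    """Illustrative check: True only if an installed gptqmodel is new enough."""
    try:
        installed = version.parse(importlib.metadata.version("gptqmodel"))
    except importlib.metadata.PackageNotFoundError:
        return False
    return installed >= GPTQMODEL_MINIMUM_VERSION
```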
5 changes: 3 additions & 2 deletions tests/gptq/test_quantization.py
@@ -193,7 +193,6 @@ class GPTQTestCUDA(GPTQTest):
expected_fp16_perplexity = 38
expected_quantized_perplexity = 45


def test_perplexity(self):
"""
A simple test to check if the model conversion has been done correctly by checking on the
@@ -309,7 +308,9 @@ def test_exllama_serialization(self):
save_folder=tmpdirname,
device_map={"": self.device_for_inference},
)
self.check_quantized_layers_type(quantized_model_from_saved, "exllama" if is_gptqmodel_available else "exllamav2")
self.check_quantized_layers_type(
quantized_model_from_saved, "exllama" if is_gptqmodel_available else "exllamav2"
)

# transformers and auto-gptq compatibility
# quantized models are more compatible with device map than
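
A stand-alone sketch of the branch being asserted in the test above; note the explicit call to `is_gptqmodel_available()`, which returns a boolean. The mapping of backend library to kernel name follows the assertion above and is otherwise an assumption.

```python
from optimum.utils import is_gptqmodel_available


def expected_exllama_kernel() -> str:
    """Illustrative helper: gptqmodel serializes with the "exllama" kernel,
    while auto-gptq uses "exllamav2" (per the assertion above)."""
    return "exllama" if is_gptqmodel_available() else "exllamav2"
```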