diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py
index 41e5355e..4000a258 100644
--- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py
+++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py
@@ -98,6 +98,9 @@ def get_lora_parameters(proj):
     base_layer = proj.base_layer if hasattr(proj, "base_layer") else proj
     qstate = extract_gptq_state(base_layer)
 
+    if base_layer.__module__.startswith("auto_gptq"):
+        setattr(qstate.qzeros, "offset", 1)
+
     if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged:
         return qstate, None, None, None, None
diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py
index c252d26d..efc3b41e 100644
--- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py
+++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py
@@ -110,7 +110,10 @@ def dequant_kernel_248(
     zeros = zeros & maxq
 
     # Dequantize
-    zeros = zeros + 1
+    # offset is None when using the local gptq package; the official auto_gptq sets an offset value
+    if getattr(qzeros_ptr, "offset", None) is not None:
+        zeros = zeros + qzeros_ptr.offset
+
     weights = weights - zeros
     weights = weights.to(tl.float32)
     weights = scales * weights
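
For context, the convention this diff relies on can be sketched outside of Triton: the loader tags the quant state's `qzeros` with an `offset` attribute only when the weights come from the official auto_gptq package, and the dequantization path adds that offset only when it is present. The sketch below is illustrative only; `PackedZeros`, `tag_offset`, and `effective_zeros` are hypothetical stand-ins, not names from this plugin, auto_gptq, or Triton.

```python
# Illustrative sketch of the "offset" convention used in the diff above.
# Assumption: PackedZeros, tag_offset, and effective_zeros are hypothetical
# stand-ins, not part of fms-acceleration, auto_gptq, or Triton.


class PackedZeros:
    """Stands in for the qzeros tensor carried in the GPTQ quant state."""

    def __init__(self, values):
        self.values = values  # zero-points, already unpacked for simplicity


def tag_offset(qzeros, producer_module: str):
    # Mirrors the fast_lora.py change: only the official auto_gptq layout
    # needs the "+ 1" correction on its stored zero-points.
    if producer_module.startswith("auto_gptq"):
        setattr(qzeros, "offset", 1)


def effective_zeros(qzeros):
    # Mirrors the kernels.py change: apply the offset only when it was tagged;
    # a local gptq package that stores unbiased zeros leaves it absent.
    offset = getattr(qzeros, "offset", None)
    if offset is not None:
        return [z + offset for z in qzeros.values]
    return list(qzeros.values)


if __name__ == "__main__":
    official = PackedZeros([7, 7, 7])
    tag_offset(official, "auto_gptq.nn_modules.qlinear")

    local = PackedZeros([8, 8, 8])
    tag_offset(local, "gptq.nn_modules.qlinear")  # hypothetical local package

    print(effective_zeros(official))  # [8, 8, 8]
    print(effective_zeros(local))     # [8, 8, 8]
```

Under these assumptions, both backends dequantize to the same effective zero-points, which is the behavior the conditional `zeros = zeros + qzeros_ptr.offset` preserves while dropping the previously unconditional `zeros = zeros + 1`.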