fix ln_fc fuse && support 128g for arbitrary shape #49

Merged · 1 commit · Aug 26, 2024
5 changes: 4 additions & 1 deletion llmc/compression/quantization/awq.py
@@ -40,7 +40,10 @@ def get_weight_scale(self, layers_dict):
         )
         weights = wquantizer.reshape_tensor(weights)
         scale = weights.abs() / weights.abs().amax(dim=1, keepdim=True)
-        scale = scale.view(org_shape)
+        try:
+            scale = scale.view(org_shape)
+        except RuntimeError:
+            scale = wquantizer.restore_tensor(scale, org_shape)
         scale = scale.mean(0)
         del weights
         gc.collect()
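Note: this fallback only triggers once reshape_tensor (see the quant.py change below) has zero-padded the weight because in_features is not a multiple of the group size; the flattened per-group scale then holds more elements than org_shape, view() raises, and restore_tensor trims the padding back off. A minimal stand-alone sketch of that failure mode, with illustrative shapes not taken from the PR:

    import torch

    group_size = 128
    w = torch.randn(16, 300)                      # out_features x in_features
    org_shape = w.shape
    pad = group_size - w.shape[1] % group_size    # 84 padded columns
    w_padded = torch.cat([w, w.new_zeros(16, pad)], dim=-1)
    grouped = w_padded.reshape(-1, group_size)    # (48, 128), as reshape_tensor returns
    scale = grouped.abs() / grouped.abs().amax(dim=1, keepdim=True)
    try:
        scale = scale.view(org_shape)             # 48*128 elements vs. 16*300 -> RuntimeError
    except RuntimeError:
        scale = scale.reshape(*org_shape[:-1], -1)[..., :-pad]  # what restore_tensor does
    print(scale.shape)                            # torch.Size([16, 300])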
26 changes: 21 additions & 5 deletions llmc/compression/quantization/base_blockwise_quantization.py
@@ -447,7 +447,7 @@ def scale_ln_fcs(self, ln, fcs, scales):
         scales = scales.to(ln.weight.device)
         ln.weight.div_(scales)
 
-        if self.model.has_bias():
+        if hasattr(ln, 'bias') and ln.bias is not None:
             ln.bias.div_(scales)
 
         for fc in fcs:
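Checking the norm module itself (rather than a model-wide has_bias() flag) matters because the fused norm can be an nn.LayerNorm, which carries a bias, or an RMSNorm-style module, which does not; a single per-model flag cannot distinguish the two per layer. A rough stand-alone sketch of the same guard (SimpleRMSNorm and scale_norm are illustrative stand-ins, not llmc names):

    import torch
    import torch.nn as nn

    class SimpleRMSNorm(nn.Module):
        # RMSNorm-style module: weight only, no bias attribute at all.
        def __init__(self, dim):
            super().__init__()
            self.weight = nn.Parameter(torch.ones(dim))

    def scale_norm(ln, scales):
        # Fold the scales into the norm; only touch a bias if this module has one.
        ln.weight.data.div_(scales)
        if hasattr(ln, 'bias') and ln.bias is not None:
            ln.bias.data.div_(scales)

    scales = torch.rand(64) + 0.5
    scale_norm(nn.LayerNorm(64), scales)    # bias exists -> also scaled
    scale_norm(SimpleRMSNorm(64), scales)   # no bias attribute -> skipped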
@@ -505,12 +505,20 @@ def apply_clip(self, layer, min_val, max_val, layer_name):
         if self.clip_version == 'v1':
             max_val = max_val.to(layer.weight.device)
             org_shape = layer.weight.shape
-            layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
+            try:
+                layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
+            except RuntimeError:
+                layer.weight.data = self.wquantizer.reshape_tensor(layer.weight.data)
+                layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
             if self.clip_sym:
                 min_val = -max_val
 
             layer.weight.data = torch.clamp(layer.weight.data, min_val, max_val)
-            layer.weight.data = layer.weight.data.reshape(org_shape)
+            try:
+                layer.weight.data = layer.weight.data.reshape(org_shape)
+            except RuntimeError:
+                layer.weight.data = self.wquantizer \
+                    .restore_tensor(layer.weight.data, org_shape)
         elif self.clip_version == 'v2':
             up_factor, low_factor = self.get_clip_factor(
                 layer, min_val, max_val, layer_name
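The v1 path views the weight as (out_channels, n_groups, group_size) so the per-group min_val/max_val broadcast over the last dimension; when in_features is not a multiple of the group size, the fallback to self.wquantizer.reshape_tensor zero-pads the columns up to the next group boundary first, and restore_tensor trims them again after clamping. A rough numeric sketch with illustrative shapes:

    import torch

    group_size, oc, ic = 128, 8, 300
    n_groups = -(-ic // group_size)                  # ceil(300 / 128) = 3
    w = torch.randn(oc, ic)
    max_val = torch.rand(oc, n_groups, 1) + 1.0      # per-group clip bound

    if ic % group_size:                              # pad 300 -> 384 columns
        pad = group_size - ic % group_size
        w = torch.cat([w, w.new_zeros(oc, pad)], dim=-1)

    grouped = w.reshape(oc, n_groups, group_size)    # (8, 3, 128)
    clipped = torch.clamp(grouped, -max_val, max_val)
    restored = clipped.reshape(oc, -1)[:, :ic]       # drop the zero padding
    print(restored.shape)                            # torch.Size([8, 300])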
@@ -585,7 +593,11 @@ def auto_clip_layer(
         else:
             group_size = w.shape[1]
 
-        w = w.reshape(w.shape[0], 1, -1, group_size)
+        try:
+            w = w.reshape(w.shape[0], 1, -1, group_size)
+        except RuntimeError:
+            w = self.wquantizer.reshape_tensor(w)
+            w = w.reshape(w.shape[0], 1, -1, group_size)
         oc_batch_size = 256 if w.shape[0] % 256 == 0 else 64  # prevent OOM
         assert w.shape[0] % oc_batch_size == 0
 
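For the clip search the weight is held in per-group form (out_channels, 1, n_groups, group_size) and then scanned in output-channel batches of 256, or 64 when 256 does not divide out_channels, to keep peak memory bounded. A small sketch of that batching (sizes are illustrative and the actual ratio search is elided):

    import torch

    group_size, oc, ic = 128, 512, 384
    w = torch.randn(oc, ic).reshape(oc, 1, -1, group_size)   # (512, 1, 3, 128)
    oc_batch_size = 256 if oc % 256 == 0 else 64             # 256 here

    for i in range(oc // oc_batch_size):
        w_batch = w[i * oc_batch_size:(i + 1) * oc_batch_size]
        org_max = w_batch.abs().amax(dim=-1, keepdim=True)   # per-group |w| max
        # ...grid-search the clip ratio for this batch against org_max...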
@@ -622,7 +634,11 @@ def auto_clip_layer(
                 input[i] = input[i].to(w.device)
                 x = input[i]
                 x = x.view(-1, x.shape[-1])
-                x = x.reshape(1, x.shape[0], -1, group_size)
+                try:
+                    x = x.reshape(1, x.shape[0], -1, group_size)
+                except RuntimeError:
+                    x = self.wquantizer.reshape_tensor(x)
+                    x = x.reshape(1, x.shape[0], -1, group_size)
                 x = x[:, 0:: x.shape[1] // n_sample_token]
                 if i in org_out_dict:
                     org_out = org_out_dict[i]
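The calibration activations get the same treatment: each input is flattened to (tokens, hidden), grouped along the hidden dimension (zero-padded via reshape_tensor when hidden is not a multiple of group_size), and then only every (tokens // n_sample_token)-th token row is kept so the clip search stays cheap. A small sketch with illustrative sizes:

    import torch

    group_size, n_sample_token = 128, 4
    x = torch.randn(13, 384)                        # 13 calibration tokens, hidden 384
    x = x.view(-1, x.shape[-1])
    x = x.reshape(1, x.shape[0], -1, group_size)    # (1, 13, 3, 128)
    x = x[:, 0:: x.shape[1] // n_sample_token]      # stride 13 // 4 = 3
    print(x.shape)                                  # torch.Size([1, 5, 3, 128])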
4 changes: 2 additions & 2 deletions llmc/compression/quantization/gptq.py
@@ -354,8 +354,8 @@ def collect_model_qparams(self):
     @torch.no_grad()
     def split_qparams(self, qparams):
         group_qparams = []
-        group_num = self.columns // self.wquantizer.group_size
-        qparams = qparams.reshape(qparams.shape[0] // group_num, -1)
+        group_num = math.ceil(self.columns / self.wquantizer.group_size)
+        qparams = qparams.reshape(math.ceil(qparams.shape[0] / group_num), -1)
         qparams = qparams.t()
         group_qparams = list(torch.split(qparams, 1, dim=0))
         for i in range(len(group_qparams)):
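With math.ceil, a layer whose column count is not a multiple of the group size still gets one qparam row per group, including the final partial group; plain floor division would under-count the groups (300 // 128 == 2 instead of 3) and the subsequent reshape/split would drop the tail group's parameters. A small worked example (the qparams layout here is illustrative, not the exact GPTQ buffer):

    import math
    import torch

    columns, group_size, rows = 300, 128, 4
    group_num = math.ceil(columns / group_size)            # 3, not 300 // 128 == 2
    qparams = torch.arange(rows * group_num, dtype=torch.float32)
    qparams = qparams.reshape(math.ceil(qparams.shape[0] / group_num), -1)  # (4, 3)
    qparams = qparams.t()                                  # (3, 4): one row per group
    group_qparams = list(torch.split(qparams, 1, dim=0))   # 3 tensors, one per group
    print(len(group_qparams), group_qparams[0].shape)      # 3 torch.Size([1, 4])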
18 changes: 16 additions & 2 deletions llmc/compression/quantization/quant.py
@@ -245,7 +245,17 @@ def quant_dequant(self, tensor, scales, zeros, max_int, min_int):
     def reshape_tensor(self, tensor):
         if self.granularity == 'per_group':
             if tensor.shape[1] >= self.group_size:
-                t = tensor.reshape(-1, self.group_size)
+                if tensor.shape[1] % self.group_size == 0:
+                    t = tensor.reshape(-1, self.group_size)
+                else:
+                    deficiency = self.group_size - tensor.shape[1] % self.group_size
+                    prefix = tensor.shape[:-1]
+                    pad_zeros = torch.zeros(
+                        (*prefix, deficiency),
+                        device=tensor.device, dtype=tensor.dtype)
+                    t = torch.cat(
+                        (tensor, pad_zeros),
+                        dim=-1).reshape(-1, self.group_size)
             else:
                 t = tensor
         elif self.granularity == 'per_head':
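This is the heart of the "support 128g for arbitrary shape" part of the PR: when the last dimension is not a multiple of group_size, the tensor is zero-padded up to the next group boundary and only then viewed as (-1, group_size). A worked example with illustrative sizes:

    import torch

    group_size = 128
    tensor = torch.randn(16, 300)                             # 300 % 128 == 44
    deficiency = group_size - tensor.shape[1] % group_size    # 84
    pad_zeros = torch.zeros(16, deficiency,
                            device=tensor.device, dtype=tensor.dtype)
    t = torch.cat((tensor, pad_zeros), dim=-1)                # (16, 384)
    t = t.reshape(-1, group_size)                             # (48, 128)
    print(t.shape)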
@@ -258,7 +268,11 @@ def restore_tensor(self, tensor, shape):
         if tensor.shape == shape:
             t = tensor
         else:
-            t = tensor.reshape(shape)
+            try:
+                t = tensor.reshape(shape)
+            except RuntimeError:
+                deficiency = self.group_size - shape[1] % self.group_size
+                t = tensor.reshape(*shape[:-1], -1)[..., :-deficiency]
         return t
 
     def fake_quant_act_static(self, act, args={}):
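restore_tensor is the inverse: if reshape(shape) fails because the grouped tensor still carries the padded columns, it is flattened back to one row per original row and the zero-padded tail is sliced off. Continuing the sketch above:

    import torch

    group_size = 128
    shape = (16, 300)
    grouped = torch.randn(48, group_size)                 # padded, grouped tensor
    deficiency = group_size - shape[1] % group_size       # 84
    restored = grouped.reshape(*shape[:-1], -1)[..., :-deficiency]
    print(restored.shape)                                 # torch.Size([16, 300])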