fix ln_fc fuse && support 128g for arbitrary shape #49

Merged · 1 commit · Aug 26, 2024
5 changes: 4 additions & 1 deletion llmc/compression/quantization/awq.py
@@ -40,7 +40,10 @@ def get_weight_scale(self, layers_dict):
         )
         weights = wquantizer.reshape_tensor(weights)
         scale = weights.abs() / weights.abs().amax(dim=1, keepdim=True)
-        scale = scale.view(org_shape)
+        try:
+            scale = scale.view(org_shape)
+        except RuntimeError:
+            scale = wquantizer.restore_tensor(scale, org_shape)
         scale = scale.mean(0)
         del weights
         gc.collect()
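Note: this fallback only triggers once reshape_tensor (see the quant.py change below) has zero-padded the weight because in_features is not a multiple of the group size; the flattened per-group scale then holds more elements than org_shape, view() raises, and restore_tensor trims the padding back off. A minimal stand-alone sketch of that failure mode, with illustrative shapes not taken from the PR:

    import torch

    group_size = 128
    w = torch.randn(16, 300)                      # out_features x in_features
    org_shape = w.shape
    pad = group_size - w.shape[1] % group_size    # 84 padded columns
    w_padded = torch.cat([w, w.new_zeros(16, pad)], dim=-1)
    grouped = w_padded.reshape(-1, group_size)    # (48, 128), as reshape_tensor returns
    scale = grouped.abs() / grouped.abs().amax(dim=1, keepdim=True)
    try:
        scale = scale.view(org_shape)             # 48*128 elements vs. 16*300 -> RuntimeError
    except RuntimeError:
        scale = scale.reshape(*org_shape[:-1], -1)[..., :-pad]  # what restore_tensor does
    print(scale.shape)                            # torch.Size([16, 300])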
26 changes: 21 additions & 5 deletions llmc/compression/quantization/base_blockwise_quantization.py
@@ -447,7 +447,7 @@ def scale_ln_fcs(self, ln, fcs, scales):
         scales = scales.to(ln.weight.device)
         ln.weight.div_(scales)
 
-        if self.model.has_bias():
+        if hasattr(ln, 'bias') and ln.bias is not None:
             ln.bias.div_(scales)
 
         for fc in fcs:
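Checking the norm module itself (rather than a model-wide has_bias() flag) matters because the fused norm can be an nn.LayerNorm, which carries a bias, or an RMSNorm-style module, which does not; a single per-model flag cannot distinguish the two per layer. A rough stand-alone sketch of the same guard (SimpleRMSNorm and scale_norm are illustrative stand-ins, not llmc names):

    import torch
    import torch.nn as nn

    class SimpleRMSNorm(nn.Module):
        # RMSNorm-style module: weight only, no bias attribute at all.
        def __init__(self, dim):
            super().__init__()
            self.weight = nn.Parameter(torch.ones(dim))

    def scale_norm(ln, scales):
        # Fold the scales into the norm; only touch a bias if this module has one.
        ln.weight.data.div_(scales)
        if hasattr(ln, 'bias') and ln.bias is not None:
            ln.bias.data.div_(scales)

    scales = torch.rand(64) + 0.5
    scale_norm(nn.LayerNorm(64), scales)    # bias exists -> also scaled
    scale_norm(SimpleRMSNorm(64), scales)   # no bias attribute -> skipped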
@@ -505,12 +505,20 @@ def apply_clip(self, layer, min_val, max_val, layer_name):
         if self.clip_version == 'v1':
             max_val = max_val.to(layer.weight.device)
             org_shape = layer.weight.shape
-            layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
+            try:
+                layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
+            except RuntimeError:
+                layer.weight.data = self.wquantizer.reshape_tensor(layer.weight.data)
+                layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
             if self.clip_sym:
                 min_val = -max_val
 
             layer.weight.data = torch.clamp(layer.weight.data, min_val, max_val)
-            layer.weight.data = layer.weight.data.reshape(org_shape)
+            try:
+                layer.weight.data = layer.weight.data.reshape(org_shape)
+            except RuntimeError:
+                layer.weight.data = self.wquantizer \
+                    .restore_tensor(layer.weight.data, org_shape)
         elif self.clip_version == 'v2':
             up_factor, low_factor = self.get_clip_factor(
                 layer, min_val, max_val, layer_name
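The v1 path views the weight as (out_channels, n_groups, group_size) so the per-group min_val/max_val broadcast over the last dimension; when in_features is not a multiple of the group size, the fallback to self.wquantizer.reshape_tensor zero-pads the columns up to the next group boundary first, and restore_tensor trims them again after clamping. A rough numeric sketch with illustrative shapes:

    import torch

    group_size, oc, ic = 128, 8, 300
    n_groups = -(-ic // group_size)                  # ceil(300 / 128) = 3
    w = torch.randn(oc, ic)
    max_val = torch.rand(oc, n_groups, 1) + 1.0      # per-group clip bound

    if ic % group_size:                              # pad 300 -> 384 columns
        pad = group_size - ic % group_size
        w = torch.cat([w, w.new_zeros(oc, pad)], dim=-1)

    grouped = w.reshape(oc, n_groups, group_size)    # (8, 3, 128)
    clipped = torch.clamp(grouped, -max_val, max_val)
    restored = clipped.reshape(oc, -1)[:, :ic]       # drop the zero padding
    print(restored.shape)                            # torch.Size([8, 300])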
@@ -585,7 +593,11 @@ def auto_clip_layer(
         else:
             group_size = w.shape[1]
 
-        w = w.reshape(w.shape[0], 1, -1, group_size)
+        try:
+            w = w.reshape(w.shape[0], 1, -1, group_size)
+        except RuntimeError:
+            w = self.wquantizer.reshape_tensor(w)
+            w = w.reshape(w.shape[0], 1, -1, group_size)
         oc_batch_size = 256 if w.shape[0] % 256 == 0 else 64  # prevent OOM
         assert w.shape[0] % oc_batch_size == 0
 
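For the clip search the weight is held in per-group form (out_channels, 1, n_groups, group_size) and then scanned in output-channel batches of 256, or 64 when 256 does not divide out_channels, to keep peak memory bounded. A small sketch of that batching (sizes are illustrative and the actual ratio search is elided):

    import torch

    group_size, oc, ic = 128, 512, 384
    w = torch.randn(oc, ic).reshape(oc, 1, -1, group_size)   # (512, 1, 3, 128)
    oc_batch_size = 256 if oc % 256 == 0 else 64             # 256 here

    for i in range(oc // oc_batch_size):
        w_batch = w[i * oc_batch_size:(i + 1) * oc_batch_size]
        org_max = w_batch.abs().amax(dim=-1, keepdim=True)   # per-group |w| max
        # ...grid-search the clip ratio for this batch against org_max...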
@@ -622,7 +634,11 @@ def auto_clip_layer(
                 input[i] = input[i].to(w.device)
                 x = input[i]
                 x = x.view(-1, x.shape[-1])
-                x = x.reshape(1, x.shape[0], -1, group_size)
+                try:
+                    x = x.reshape(1, x.shape[0], -1, group_size)
+                except RuntimeError:
+                    x = self.wquantizer.reshape_tensor(x)
+                    x = x.reshape(1, x.shape[0], -1, group_size)
                 x = x[:, 0:: x.shape[1] // n_sample_token]
                 if i in org_out_dict:
                     org_out = org_out_dict[i]
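The calibration activations get the same treatment: each input is flattened to (tokens, hidden), grouped along the hidden dimension (zero-padded via reshape_tensor when hidden is not a multiple of group_size), and then only every (tokens // n_sample_token)-th token row is kept so the clip search stays cheap. A small sketch with illustrative sizes:

    import torch

    group_size, n_sample_token = 128, 4
    x = torch.randn(13, 384)                        # 13 calibration tokens, hidden 384
    x = x.view(-1, x.shape[-1])
    x = x.reshape(1, x.shape[0], -1, group_size)    # (1, 13, 3, 128)
    x = x[:, 0:: x.shape[1] // n_sample_token]      # stride 13 // 4 = 3
    print(x.shape)                                  # torch.Size([1, 5, 3, 128])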
4 changes: 2 additions & 2 deletions llmc/compression/quantization/gptq.py
@@ -354,8 +354,8 @@ def collect_model_qparams(self):
     @torch.no_grad()
     def split_qparams(self, qparams):
         group_qparams = []
-        group_num = self.columns // self.wquantizer.group_size
-        qparams = qparams.reshape(qparams.shape[0] // group_num, -1)
+        group_num = math.ceil(self.columns / self.wquantizer.group_size)
+        qparams = qparams.reshape(math.ceil(qparams.shape[0] / group_num), -1)
         qparams = qparams.t()
         group_qparams = list(torch.split(qparams, 1, dim=0))
         for i in range(len(group_qparams)):
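With math.ceil, a layer whose column count is not a multiple of the group size still gets one qparam row per group, including the final partial group; plain floor division would under-count the groups (300 // 128 == 2 instead of 3) and the subsequent reshape/split would drop the tail group's parameters. A small worked example (the qparams layout here is illustrative, not the exact GPTQ buffer):

    import math
    import torch

    columns, group_size, rows = 300, 128, 4
    group_num = math.ceil(columns / group_size)            # 3, not 300 // 128 == 2
    qparams = torch.arange(rows * group_num, dtype=torch.float32)
    qparams = qparams.reshape(math.ceil(qparams.shape[0] / group_num), -1)  # (4, 3)
    qparams = qparams.t()                                  # (3, 4): one row per group
    group_qparams = list(torch.split(qparams, 1, dim=0))   # 3 tensors, one per group
    print(len(group_qparams), group_qparams[0].shape)      # 3 torch.Size([1, 4])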
18 changes: 16 additions & 2 deletions llmc/compression/quantization/quant.py
@@ -245,7 +245,17 @@ def quant_dequant(self, tensor, scales, zeros, max_int, min_int):
     def reshape_tensor(self, tensor):
         if self.granularity == 'per_group':
             if tensor.shape[1] >= self.group_size:
-                t = tensor.reshape(-1, self.group_size)
+                if tensor.shape[1] % self.group_size == 0:
+                    t = tensor.reshape(-1, self.group_size)
+                else:
+                    deficiency = self.group_size - tensor.shape[1] % self.group_size
+                    prefix = tensor.shape[:-1]
+                    pad_zeros = torch.zeros(
+                        (*prefix, deficiency),
+                        device=tensor.device, dtype=tensor.dtype)
+                    t = torch.cat(
+                        (tensor, pad_zeros),
+                        dim=-1).reshape(-1, self.group_size)
             else:
                 t = tensor
         elif self.granularity == 'per_head':
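This is the heart of the "support 128g for arbitrary shape" part of the PR: when the last dimension is not a multiple of group_size, the tensor is zero-padded up to the next group boundary and only then viewed as (-1, group_size). A worked example with illustrative sizes:

    import torch

    group_size = 128
    tensor = torch.randn(16, 300)                             # 300 % 128 == 44
    deficiency = group_size - tensor.shape[1] % group_size    # 84
    pad_zeros = torch.zeros(16, deficiency,
                            device=tensor.device, dtype=tensor.dtype)
    t = torch.cat((tensor, pad_zeros), dim=-1)                # (16, 384)
    t = t.reshape(-1, group_size)                             # (48, 128)
    print(t.shape)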
@@ -258,7 +268,11 @@ def restore_tensor(self, tensor, shape):
         if tensor.shape == shape:
             t = tensor
         else:
-            t = tensor.reshape(shape)
+            try:
+                t = tensor.reshape(shape)
+            except RuntimeError:
+                deficiency = self.group_size - shape[1] % self.group_size
+                t = tensor.reshape(*shape[:-1], -1)[..., :-deficiency]
         return t
 
     def fake_quant_act_static(self, act, args={}):
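restore_tensor is the inverse: if reshape(shape) fails because the grouped tensor still carries the padded columns, it is flattened back to one row per original row and the zero-padded tail is sliced off. Continuing the sketch above:

    import torch

    group_size = 128
    shape = (16, 300)
    grouped = torch.randn(48, group_size)                 # padded, grouped tensor
    deficiency = group_size - shape[1] % group_size       # 84
    restored = grouped.reshape(*shape[:-1], -1)[..., :-deficiency]
    print(restored.shape)                                 # torch.Size([16, 300])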