From d9dc8e564fed1527be2a9d8d5c74daf3823c3203 Mon Sep 17 00:00:00 2001
From: Casper
Date: Sat, 2 Mar 2024 11:10:57 +0100
Subject: [PATCH 01/18] Fix double bias (#383)

---
 awq/modules/linear/gemm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/awq/modules/linear/gemm.py b/awq/modules/linear/gemm.py
index 1eb942c2..6efc7ee1 100644
--- a/awq/modules/linear/gemm.py
+++ b/awq/modules/linear/gemm.py
@@ -253,7 +253,6 @@ def forward(self, x):
         if input_dtype != torch.float16:
             out = out.to(dtype=input_dtype)
 
-        out = out + self.bias if self.bias is not None else out
         return out.reshape(out_shape)
 
     def extra_repr(self) -> str:

From f713b888721b4d8d1137dbe153cbfc004c00db2f Mon Sep 17 00:00:00 2001
From: Oscar Savolainen <119876479+OscarSavolainenDR@users.noreply.github.com>
Date: Sat, 2 Mar 2024 05:11:50 -0500
Subject: [PATCH 02/18] x_max -> x_mean and w_max -> w_mean name changes and some comments (#378)

---
 awq/quantize/quantizer.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/awq/quantize/quantizer.py b/awq/quantize/quantizer.py
index 1bf89fd3..37b1dbdc 100644
--- a/awq/quantize/quantizer.py
+++ b/awq/quantize/quantizer.py
@@ -244,17 +244,23 @@ def _search_best_scale(
         # Put x on the right device
         inp = inp.to(next(module2inspect.parameters()).device)
 
-        # [STEP 1]: Compute maximum of weight
+        # [STEP 1]: Compute per-channel mean of normalised weights
+        # All layer weights are concatted together
         weight = torch.cat([_m.weight for _m in layers], dim=0)
         org_shape = weight.shape
+        # The weights are reshaped to be organised by quantization group
         weight = weight.view(-1, self.group_size)
+        # Calculates the relative magnitude of the weights within each of the quantization groups,
+        # and rescales each group individually so that each group has weights on a 0-1 scale.
w_scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True) + # Resizes the rescaled weight matrix back up to its original dimensions w_scale = w_scale.view(org_shape) - w_max = w_scale.mean(0) + # Gets the average rescaled magnitude for each output channel + w_mean = w_scale.mean(0) clear_memory(weight) - # [STEP 2]: Compute maximum of x - x_max = inp.abs().view(-1, inp.shape[-1]).mean(0) + # [STEP 2]: Compute per-channel mean of the input activation + x_mean = inp.abs().view(-1, inp.shape[-1]).mean(0) # [STEP 3]: Compute output of module with torch.no_grad(): @@ -266,7 +272,7 @@ def _search_best_scale( # [STEP 4]: Compute loss best_scales = self._compute_best_scale( - inp, w_max, x_max, module2inspect, layers, fp16_output, module_kwargs + inp, w_mean, x_mean, module2inspect, layers, fp16_output, module_kwargs ) return ( @@ -278,8 +284,8 @@ def _search_best_scale( def _compute_best_scale( self, x, - w_max, - x_max, + w_mean, + x_mean, module2inspect, linears2scale: List[nn.Linear], fp16_output, @@ -303,8 +309,8 @@ def _compute_best_scale( org_sd = {k: v.cpu() for k, v in module2inspect.state_dict().items()} device = x.device - x_max = x_max.view(-1).to(device) - w_max = w_max.view(-1).to(device) + x_mean = x_mean.view(-1).to(device) + w_mean = w_mean.view(-1).to(device) for ratio in range(n_grid): # create new scales @@ -312,9 +318,9 @@ def _compute_best_scale( # NOTE: s^-1 * x is fused here, according to paper if self.duo_scaling: - scales = (x_max.pow(ratio) / w_max.pow(1 - ratio)).clamp(min=1e-4) + scales = (x_mean.pow(ratio) / w_mean.pow(1 - ratio)).clamp(min=1e-4) else: - scales = x_max.pow(ratio).clamp(min=1e-4).view(-1) + scales = x_mean.pow(ratio).clamp(min=1e-4).view(-1) scales = scales / (scales.max() * scales.min()).sqrt() scales_view = scales.view(1, -1).to(device) From d8ca1e2fa8508131676e5a7c3ce63cc90a5adca4 Mon Sep 17 00:00:00 2001 From: Casper Date: Sat, 2 Mar 2024 11:13:07 +0100 Subject: [PATCH 03/18] Bump to 0.2.3 --- awq/__init__.py | 2 +- scripts/download_wheels.sh | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/awq/__init__.py b/awq/__init__.py index e865f860..2160a0f4 100644 --- a/awq/__init__.py +++ b/awq/__init__.py @@ -1,2 +1,2 @@ -__version__ = "0.2.2" +__version__ = "0.2.3" from awq.models.auto import AutoAWQForCausalLM diff --git a/scripts/download_wheels.sh b/scripts/download_wheels.sh index 4cccf014..70939155 100644 --- a/scripts/download_wheels.sh +++ b/scripts/download_wheels.sh @@ -1,7 +1,7 @@ #!/bin/bash # Set variables -AWQ_VERSION="0.2.2" +AWQ_VERSION="0.2.3" RELEASE_URL="https://api.github.com/repos/casper-hansen/AutoAWQ/releases/tags/v${AWQ_VERSION}" # Create a directory to download the wheels diff --git a/setup.py b/setup.py index a580dc86..409f73d7 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def get_kernels_whl_url( return f"https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v{release_version}/autoawq_kernels-{release_version}+{gpu_system_version}-cp{python_version}-cp{python_version}-{platform}_{architecture}.whl" -AUTOAWQ_VERSION = "0.2.2" +AUTOAWQ_VERSION = "0.2.3" PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1" CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda From 94e73f0b2abb1d5303d72231540e922e0484383d Mon Sep 17 00:00:00 2001 From: TechxGenus Date: Mon, 11 Mar 2024 22:15:10 +0800 Subject: [PATCH 04/18] Add Gemma Support (#393) --- awq/models/__init__.py | 1 + awq/models/auto.py | 1 + awq/models/base.py | 1 + awq/models/gemma.py | 149 
+++++++++++++++++++++++++++++++++++++ awq/modules/fused/attn.py | 13 +++- awq/modules/fused/block.py | 8 ++ awq/modules/fused/model.py | 4 +- awq/quantize/scale.py | 13 +++- 8 files changed, 182 insertions(+), 8 deletions(-) create mode 100644 awq/models/gemma.py diff --git a/awq/models/__init__.py b/awq/models/__init__.py index 14886a24..75542fe4 100644 --- a/awq/models/__init__.py +++ b/awq/models/__init__.py @@ -14,3 +14,4 @@ from .llava import LlavaAWQForCausalLM from .mixtral import MixtralAWQForCausalLM from .qwen2 import Qwen2AWQForCausalLM +from .gemma import GemmaAWQForCausalLM diff --git a/awq/models/auto.py b/awq/models/auto.py index c992061f..1ac6342a 100644 --- a/awq/models/auto.py +++ b/awq/models/auto.py @@ -23,6 +23,7 @@ "baichuan": BaichuanAWQForCausalLM, "llava": LlavaAWQForCausalLM, "qwen2": Qwen2AWQForCausalLM, + "gemma": GemmaAWQForCausalLM, } diff --git a/awq/models/base.py b/awq/models/base.py index 8ef243ab..e5691ae0 100644 --- a/awq/models/base.py +++ b/awq/models/base.py @@ -67,6 +67,7 @@ "baichuan": "AutoModelForCausalLM", "llava": "AutoModelForVision2Seq", "qwen2": "AutoModelForCausalLM", + "gemma": "AutoModelForCausalLM", } diff --git a/awq/models/gemma.py b/awq/models/gemma.py new file mode 100644 index 00000000..b3ed65db --- /dev/null +++ b/awq/models/gemma.py @@ -0,0 +1,149 @@ +import tqdm +import torch +from typing import List, Tuple +from .base import BaseAWQForCausalLM +from awq.utils.fused_utils import fuse_qkv +from awq.modules.fused.block import LlamaLikeBlock +from awq.modules.fused.model import LlamaLikeModel +from transformers.models.gemma.modeling_gemma import ( + GemmaDecoderLayer as OldGemmaDecoderLayer, + GemmaForCausalLM as OldGemmaForCausalLM, +) +from awq.modules.fused.norm import FasterTransformerRMSNorm + + +class GemmaAWQForCausalLM(BaseAWQForCausalLM): + layer_type = "GemmaDecoderLayer" + max_new_tokens_key = "max_position_embeddings" + + @staticmethod + def fuse_layers(model: OldGemmaDecoderLayer): + fuser = GemmaFuser(model) + fuser.fuse_transformer() + + @staticmethod + def get_model_layers(model: OldGemmaForCausalLM): + return model.model.layers + + @staticmethod + def get_act_for_scaling(module: OldGemmaDecoderLayer): + return dict(is_scalable=False) + + @staticmethod + def move_embed(model: OldGemmaForCausalLM, device: str): + model.model.embed_tokens = model.model.embed_tokens.to(device) + + @staticmethod + def get_layers_for_scaling(module: OldGemmaDecoderLayer, input_feat, module_kwargs): + layers = [] + + # attention input + layers.append( + dict( + prev_op=module.input_layernorm, + layers=[ + module.self_attn.q_proj, + module.self_attn.k_proj, + module.self_attn.v_proj, + ], + inp=input_feat["self_attn.q_proj"], + module2inspect=module.self_attn, + kwargs=module_kwargs, + ) + ) + + # attention out + # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696 + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: + layers.append( + dict( + prev_op=module.self_attn.v_proj, + layers=[module.self_attn.o_proj], + inp=input_feat["self_attn.o_proj"], + ) + ) + + # linear 1 + layers.append( + dict( + prev_op=module.post_attention_layernorm, + layers=[module.mlp.gate_proj, module.mlp.up_proj], + inp=input_feat["mlp.gate_proj"], + module2inspect=module.mlp, + ) + ) + + # linear 2 + layers.append( + dict( + prev_op=module.mlp.up_proj, + layers=[module.mlp.down_proj], + inp=input_feat["mlp.down_proj"], + ) + ) + + return layers + + +class GemmaFuser: + def __init__(self, model: 
OldGemmaForCausalLM): + self.model = model + + self.Gemma_blocks: List[Tuple[str, OldGemmaDecoderLayer]] = [ + (name, module) + for name, module in self.model.named_modules() + if "GemmaDecoderLayer".lower() in module.__class__.__name__.lower() + ] + + def fuse_transformer(self): + blocks = [] + + module: OldGemmaDecoderLayer + for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."): + device = next(iter(module.state_dict().values())).device + qkv = fuse_qkv( + module, + module.self_attn.q_proj, + module.self_attn.k_proj, + module.self_attn.v_proj, + ) + with torch.no_grad(): + # GemmaRMSNorm is different from Llama's in that it multiplies + # (1 + weight) to the output, instead of just weight. + module.input_layernorm.weight += 1 + module.post_attention_layernorm.weight += 1 + norm_1 = FasterTransformerRMSNorm( + module.input_layernorm.weight, module.input_layernorm.eps + ) + norm_2 = FasterTransformerRMSNorm( + module.post_attention_layernorm.weight, + module.post_attention_layernorm.eps, + ) + blocks.append( + LlamaLikeBlock( + hidden_size=self.model.config.hidden_size, + n_heads=self.model.config.num_attention_heads, + n_kv_heads=self.model.config.num_key_value_heads, + qkv_layer=qkv, + o_proj=module.self_attn.o_proj, + mlp=module.mlp, + norm_1=norm_1, + norm_2=norm_2, + dev=device, + max_seq_len=self.model.config.max_seq_len, + rope_theta=self.model.config.rope_theta, + head_dim=self.model.config.head_dim, + ) + ) + + with torch.no_grad(): + # Normalize Gemma's embedding layer + self.model.model.embed_tokens.weight *= self.model.config.hidden_size**0.5 + + self.model.model = LlamaLikeModel( + self.model.config.vocab_size, + blocks, + self.model.model.embed_tokens, + self.model.model.norm, + ) + setattr(self.model.model, "blocks", self.model.model.blocks) diff --git a/awq/modules/fused/attn.py b/awq/modules/fused/attn.py index f90fd502..f1732ea5 100644 --- a/awq/modules/fused/attn.py +++ b/awq/modules/fused/attn.py @@ -25,12 +25,12 @@ class RoPE(nn.Module): - def __init__(self, hidden_size, n_heads, max_seq_len, device, rope_theta): + def __init__(self, head_dim, max_seq_len, device, rope_theta): super(RoPE, self).__init__() self.freqs_cis = nn.Parameter( self.precompute_freqs_cis( - hidden_size // n_heads, max_seq_len * 2, rope_theta + head_dim, max_seq_len * 2, rope_theta ).to(device), requires_grad=False, ) @@ -118,6 +118,7 @@ def __init__( use_alibi=False, attention_shapes=None, rope_theta=10000, + head_dim=None, **kwargs ): super().__init__() @@ -125,7 +126,11 @@ def __init__( self.n_heads = n_heads self.n_kv_heads = n_kv_heads self.n_kv_groups = n_heads // n_kv_heads if n_kv_heads != 0 else 0 - self.head_dim = self.hidden_size // n_heads + self.head_dim = head_dim + + if head_dim is None: + self.head_dim = hidden_size // n_heads + self.qkv_proj = qkv_layer self.o_proj = o_proj self.start_pos = 0 @@ -162,7 +167,7 @@ def __init__( self.is_neox = False else: self.alibi = None - self.rope = RoPE(hidden_size, n_heads, max_seq_len, dev, rope_theta) + self.rope = RoPE(self.head_dim, max_seq_len, dev, rope_theta) self.rotary_dim = self.head_dim self.is_neox = True diff --git a/awq/modules/fused/block.py b/awq/modules/fused/block.py index 0ffc4b93..23cd954d 100644 --- a/awq/modules/fused/block.py +++ b/awq/modules/fused/block.py @@ -80,10 +80,17 @@ def __init__( max_seq_len, rope_theta=10000, use_alibi=False, + head_dim=None, ): super().__init__() self.n_heads = n_heads self.n_kv_heads = n_kv_heads + self.head_dim = hidden_size // n_heads + + # To support gemma-7b, its 
head_dim is separate + if head_dim: + self.head_dim = head_dim + self.hidden_size = hidden_size self.norm_1 = norm_1.to(dev) self.attn = QuantAttentionFused( @@ -96,6 +103,7 @@ def __init__( max_seq_len=max_seq_len, use_alibi=use_alibi, rope_theta=rope_theta, + head_dim=head_dim, ).to(dev) self.norm_2 = norm_2.to(dev) self.mlp = mlp.to(dev) diff --git a/awq/modules/fused/model.py b/awq/modules/fused/model.py index c02233f6..c1ba2c1e 100644 --- a/awq/modules/fused/model.py +++ b/awq/modules/fused/model.py @@ -116,14 +116,14 @@ def forward( h, mask, ) - h, _, past_key_value = layer( + h, _, _ = layer( h, None, attention_mask=mask, is_causal=is_causal ) h = self.norm(h) return BaseModelOutputWithPast( last_hidden_state=h, - past_key_values=past_key_value, + past_key_values=None, hidden_states=(), attentions=(), ) diff --git a/awq/quantize/scale.py b/awq/quantize/scale.py index 0ee6ea05..47899cc5 100644 --- a/awq/quantize/scale.py +++ b/awq/quantize/scale.py @@ -6,9 +6,10 @@ from awq.utils.module import get_op_by_name, set_op_by_name from transformers.models.bloom.modeling_bloom import BloomGelu from transformers.models.llama.modeling_llama import LlamaRMSNorm +from transformers.models.gemma.modeling_gemma import GemmaRMSNorm from transformers.activations import NewGELUActivation, PytorchGELUTanh, GELUActivation -allowed_norms = [nn.LayerNorm, LlamaRMSNorm] +allowed_norms = [nn.LayerNorm, LlamaRMSNorm, GemmaRMSNorm] allowed_act_fns = [ nn.GELU, BloomGelu, @@ -88,7 +89,15 @@ def scale_ln_fcs(ln: nn.Linear, fcs: List[nn.Linear], scales: torch.Tensor): scales = scales.to(ln.weight.device) - ln.weight.div_(scales) + # GemmaRMSNorm is different from Llama's in that it multiplies + # (1 + weight) to the output, instead of just weight. + if isinstance(ln, GemmaRMSNorm): + ln.weight += 1 + ln.weight.div_(scales) + ln.weight -= 1 + else: + ln.weight.div_(scales) + if hasattr(ln, "bias") and ln.bias is not None: ln.bias.div_(scales) From 0f942181a42d2d7207a4f7f5d45f93e36738dd19 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 24 Mar 2024 12:24:51 +0100 Subject: [PATCH 05/18] Pin transformers>=4.35.0,<=4.38.2 (#408) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 409f73d7..a46b9443 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,7 @@ def get_kernels_whl_url( requirements = [ "torch>=2.0.1", - "transformers>=4.35.0", + "transformers>=4.35.0,<=4.38.2", "tokenizers>=0.12.1", "typing_extensions>=4.8.0", "accelerate", From 0fa9a2c1573a13352c2fb05b0d5c654cf52f6ef8 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 24 Mar 2024 12:26:27 +0100 Subject: [PATCH 06/18] Bump to v0.2.4 (#409) --- awq/__init__.py | 2 +- scripts/download_wheels.sh | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/awq/__init__.py b/awq/__init__.py index 2160a0f4..1870e2bd 100644 --- a/awq/__init__.py +++ b/awq/__init__.py @@ -1,2 +1,2 @@ -__version__ = "0.2.3" +__version__ = "0.2.4" from awq.models.auto import AutoAWQForCausalLM diff --git a/scripts/download_wheels.sh b/scripts/download_wheels.sh index 70939155..9f2c43a3 100644 --- a/scripts/download_wheels.sh +++ b/scripts/download_wheels.sh @@ -1,7 +1,7 @@ #!/bin/bash # Set variables -AWQ_VERSION="0.2.3" +AWQ_VERSION="0.2.4" RELEASE_URL="https://api.github.com/repos/casper-hansen/AutoAWQ/releases/tags/v${AWQ_VERSION}" # Create a directory to download the wheels diff --git a/setup.py b/setup.py index a46b9443..c02ff895 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def 
get_kernels_whl_url( return f"https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v{release_version}/autoawq_kernels-{release_version}+{gpu_system_version}-cp{python_version}-cp{python_version}-{platform}_{architecture}.whl" -AUTOAWQ_VERSION = "0.2.3" +AUTOAWQ_VERSION = "0.2.4" PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1" CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda From 5d7b0502c87b26e56cd9ceaf89d7076e1d6a15a8 Mon Sep 17 00:00:00 2001 From: TechxGenus Date: Sat, 6 Apr 2024 20:06:08 +0800 Subject: [PATCH 07/18] Fix fused models for tf >= 4.39 (#418) --- awq/modules/fused/model.py | 8 ++++++++ setup.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/awq/modules/fused/model.py b/awq/modules/fused/model.py index c1ba2c1e..8733722b 100644 --- a/awq/modules/fused/model.py +++ b/awq/modules/fused/model.py @@ -83,6 +83,14 @@ def __init__(self, vocab_size, blocks, embedding, norm): self.blocks: List[LlamaLikeBlock] = nn.ModuleList(blocks) self.norm = norm self.last_forward_num_tokens = 0 + + @property + def embed_tokens(self): + return self.embedding + + @property + def layers(self): + return self.blocks @torch.inference_mode() def forward( diff --git a/setup.py b/setup.py index c02ff895..8ef3a96f 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,7 @@ def get_kernels_whl_url( requirements = [ "torch>=2.0.1", - "transformers>=4.35.0,<=4.38.2", + "transformers>=4.35.0", "tokenizers>=0.12.1", "typing_extensions>=4.8.0", "accelerate", From 1f07200a2659c27f159cf8dda730096a77f9ccce Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Sat, 6 Apr 2024 14:08:33 +0200 Subject: [PATCH 08/18] FIX: Add safe guards for static cache + llama on transformers latest (#401) --- awq/modules/fused/attn.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/awq/modules/fused/attn.py b/awq/modules/fused/attn.py index f1732ea5..9775126b 100644 --- a/awq/modules/fused/attn.py +++ b/awq/modules/fused/attn.py @@ -188,16 +188,19 @@ def forward( # Always reset to 0 self.start_pos = 0 + hf_is_generating = False + + if self.is_hf_transformers and "use_cache" in kwargs: + hf_is_generating = kwargs["use_cache"] + + # In case we re-generate, we need to refresh the starting position # to 0. We detect it by checking if `past_key_values` is set to None, # which indicates that we are on the first step of `generate()`. 
# This is only applicable for `transformers` integration - if ( - self.is_hf_transformers - and "past_key_value" in kwargs - and kwargs["past_key_value"] is None - ): + if (self.is_hf_transformers and "past_key_value" in kwargs and kwargs["past_key_value"] is None) or (self.is_hf_transformers and not hf_is_generating): self.start_pos = 0 + xqkv = self.qkv_proj(hidden_states) xqkv = xqkv.view((bsz, seqlen) + self.attention_shapes["xqkv_view"]) @@ -214,8 +217,6 @@ def forward( if not self.use_alibi: xq, xk = self.rope.forward(xq, xk, self.start_pos, seqlen) - self.cache.to(xq) - values_store = xv.transpose(2, 1) keys_store = ( xk.reshape((bsz, seqlen) + self.attention_shapes["xk_reshape"]) @@ -223,6 +224,7 @@ def forward( .contiguous() ) + self.cache.to(xq) self.cache.update_kv(values_store, keys_store, bsz, self.start_pos, seqlen) # Only necessary to retrieve from cache when we are not processing context @@ -248,6 +250,11 @@ def forward( # When seqlen is 1, there is nothing else to attend to if attention_mask is not None and seqlen > 1: + # For llama-arch, the causal mask is preallocated with bsz x 1 x max_seq_len x max_seq_len, thus we + # need to slice it + if attention_mask.shape[-1] != seqlen: + attention_mask = attention_mask[:, :, :seqlen, :seqlen] + scores = ( scores + attention_mask ) # (bs, n_local_heads, slen, cache_len + slen) @@ -278,11 +285,15 @@ def forward( attn_output = self.o_proj(attention_weight) self.start_pos += seqlen + if self.is_hf_transformers and not hf_is_generating: + self.start_pos = 0 + # past_key_value is replaced with cache_v, cache_k, returning empty data # we pass a dummy past kv cache for transformers to be able to retrieve the correct info # about past key length past_key_value = [torch.zeros(1, 1, self.start_pos, 1)] + if HF_NEW_CACHE_FORMAT and self.is_hf_transformers: new_cache = DynamicCache() new_cache.update(past_key_value[0], past_key_value[0], layer_idx=0) From c780d650b756708b49df348ea452b27f6fec5e63 Mon Sep 17 00:00:00 2001 From: Casper Date: Sat, 6 Apr 2024 14:15:41 +0200 Subject: [PATCH 09/18] Pin: lm_eval==0.4.1 (#426) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8ef3a96f..e81ee71a 100644 --- a/setup.py +++ b/setup.py @@ -147,7 +147,7 @@ def get_kernels_whl_url( packages=find_packages(), install_requires=requirements, extras_require={ - "eval": ["lm_eval>=0.4.0", "tabulate", "protobuf", "evaluate", "scipy"], + "eval": ["lm_eval==0.4.1", "tabulate", "protobuf", "evaluate", "scipy"], "dev": ["black", "mkdocstrings-python", "mkdocs-material", "griffe-typingdoc"] }, **common_setup_kwargs, From b5db7fcdb17613bd6c0e5b963fecfb5bb1347021 Mon Sep 17 00:00:00 2001 From: Casper Date: Sat, 6 Apr 2024 14:36:55 +0200 Subject: [PATCH 10/18] Implement `apply_clip` argument to `quantize()` (#427) --- awq/models/base.py | 7 +++++++ awq/quantize/quantizer.py | 17 ++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/awq/models/base.py b/awq/models/base.py index e5691ae0..e09336fa 100644 --- a/awq/models/base.py +++ b/awq/models/base.py @@ -136,6 +136,12 @@ def quantize( "This argument avoids real quantization by only applying the scales without quantizing down to FP16." ), ] = False, + apply_clip: Annotated[ + bool, + Doc( + "Whether to apply clipping to the model during quantization. Some models may perform better with this set to False." + ), + ] = True, ): """ The main quantization function that you can use to quantize your model. 
@@ -173,6 +179,7 @@ def quantize( duo_scaling, modules_to_not_convert=self.quant_config.modules_to_not_convert, export_compatible=export_compatible, + apply_clip=apply_clip, ) self.quantizer.quantize() diff --git a/awq/quantize/quantizer.py b/awq/quantize/quantizer.py index 37b1dbdc..6a4574e6 100644 --- a/awq/quantize/quantizer.py +++ b/awq/quantize/quantizer.py @@ -40,6 +40,7 @@ def __init__( duo_scaling, modules_to_not_convert=None, export_compatible=False, + apply_clip=True, ) -> None: self.awq_model = awq_model self.model = model @@ -53,6 +54,7 @@ def __init__( self.text_column = text_column self.duo_scaling = duo_scaling self.export_compatible = export_compatible + self.apply_clip = apply_clip self.modules_to_not_convert = ( modules_to_not_convert if modules_to_not_convert is not None else [] ) @@ -161,13 +163,14 @@ def quantize(self): ) # [STEP 3]: Compute and apply clipping list - clip_list = self._search_best_clip( - self.modules[i], named_linears, input_feat - ) - apply_clip(self.modules[i], clip_list) - clip_list = append_str_prefix( - clip_list, get_op_name(self.model, self.modules[i]) + "." - ) + if self.apply_clip: + clip_list = self._search_best_clip( + self.modules[i], named_linears, input_feat + ) + apply_clip(self.modules[i], clip_list) + clip_list = append_str_prefix( + clip_list, get_op_name(self.model, self.modules[i]) + "." + ) # [STEP 4]: Quantize weights if not self.export_compatible: From f83537937f9731c0b669efa92425ac7c06311c5c Mon Sep 17 00:00:00 2001 From: Casper Date: Sat, 6 Apr 2024 14:37:53 +0200 Subject: [PATCH 11/18] Workaround: illegal memory access (#421) --- awq/modules/linear/gemv_fast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/awq/modules/linear/gemv_fast.py b/awq/modules/linear/gemv_fast.py index 6e75bd06..78e35d80 100644 --- a/awq/modules/linear/gemv_fast.py +++ b/awq/modules/linear/gemv_fast.py @@ -189,7 +189,8 @@ def from_linear( @torch.no_grad() def forward(self, x): inputs = x - if inputs.numel() / inputs.shape[-1] < 8: + batch_size, n_tokens, _ = inputs.shape + if batch_size < 8 and n_tokens == 1: out = awq_v2_ext.gemv_forward_cuda_decode( inputs, self.qweight, From eb85f67d36ccd72e7cdf5cdc29954a265603e062 Mon Sep 17 00:00:00 2001 From: Roshiago Date: Sat, 6 Apr 2024 15:41:35 +0300 Subject: [PATCH 12/18] Add download_kwargs for load model (#302) (#399) --- awq/models/auto.py | 4 ++++ awq/models/base.py | 27 ++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/awq/models/auto.py b/awq/models/auto.py index 1ac6342a..cf35a279 100644 --- a/awq/models/auto.py +++ b/awq/models/auto.py @@ -51,6 +51,7 @@ def from_pretrained( trust_remote_code=True, safetensors=True, device_map=None, + download_kwargs=None, **model_init_kwargs, ) -> BaseAWQForCausalLM: model_type = check_and_get_model_type( @@ -63,6 +64,7 @@ def from_pretrained( trust_remote_code=trust_remote_code, safetensors=safetensors, device_map=device_map, + download_kwargs=download_kwargs, **model_init_kwargs, ) @@ -80,6 +82,7 @@ def from_quantized( safetensors=True, device_map="balanced", offload_folder=None, + download_kwargs=None, **config_kwargs, ) -> BaseAWQForCausalLM: os.environ["AWQ_BATCH_SIZE"] = str(batch_size) @@ -104,5 +107,6 @@ def from_quantized( safetensors=safetensors, device_map=device_map, offload_folder=offload_folder, + download_kwargs=download_kwargs, **config_kwargs, ) diff --git a/awq/models/base.py b/awq/models/base.py index e09336fa..12607348 100644 --- a/awq/models/base.py +++ b/awq/models/base.py @@ -297,6 
+297,9 @@ def from_pretrained( "A device map that will be passed onto the model loading method from transformers." ), ] = None, + download_kwargs: Annotated[ + Dict, Doc("Used for configure download model"), + ] = None, **model_init_kwargs: Annotated[ Dict, Doc( @@ -307,7 +310,9 @@ def from_pretrained( """A method for initialization of pretrained models, usually in FP16.""" # Get weights path and quant config model_weights_path, config, quant_config = self._load_config( - self, model_path, "", safetensors, trust_remote_code=trust_remote_code + self, model_path, "", safetensors, + trust_remote_code=trust_remote_code, + download_kwargs=download_kwargs ) target_cls_name = TRANSFORMERS_AUTO_MAPPING_DICT[config.model_type] @@ -390,6 +395,9 @@ def from_quantized( str, Doc("The folder ot offload the model to."), ] = None, + download_kwargs: Annotated[ + Dict, Doc("Used for configure download model"), + ] = None, **config_kwargs: Annotated[ Dict, Doc( @@ -406,6 +414,7 @@ def from_quantized( safetensors, trust_remote_code, max_seq_len=max_seq_len, + download_kwargs=download_kwargs, **config_kwargs, ) @@ -477,6 +486,7 @@ def _load_config( safetensors=True, trust_remote_code=True, max_seq_len=4096, + download_kwargs=None, **config_kwargs, ): # [STEP 1] Download model if path is not a directory @@ -486,8 +496,19 @@ def _load_config( ignore_patterns.extend(["*.pt*", "*.bin*", "consolidated*"]) else: ignore_patterns.append("*.safetensors*") - - model_path = snapshot_download(model_path, ignore_patterns=ignore_patterns) + + if download_kwargs is None: + download_kwargs = {} + + if "ignore_patterns" in download_kwargs: + download_kwargs_ignore_patterns = download_kwargs.pop("ignore_patterns") + + if isinstance(download_kwargs_ignore_patterns, str): + ignore_patterns.append(download_kwargs_ignore_patterns) + elif isinstance(download_kwargs_ignore_patterns, list): + ignore_patterns.extend(download_kwargs_ignore_patterns) + + model_path = snapshot_download(model_path, ignore_patterns=ignore_patterns, **download_kwargs) if model_filename != "": model_weights_path = model_path + f"/{model_filename}" From 33dfb04853310e52fa30abf93af9d6ed85550855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=B9=B4?= <48116214+shaonianyr@users.noreply.github.com> Date: Sat, 6 Apr 2024 21:06:04 +0800 Subject: [PATCH 13/18] add starcoder2 support (#406) Co-authored-by: charrli --- awq/models/__init__.py | 1 + awq/models/auto.py | 1 + awq/models/base.py | 1 + awq/models/starcoder2.py | 141 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 144 insertions(+) create mode 100644 awq/models/starcoder2.py diff --git a/awq/models/__init__.py b/awq/models/__init__.py index 75542fe4..b2496170 100644 --- a/awq/models/__init__.py +++ b/awq/models/__init__.py @@ -15,3 +15,4 @@ from .mixtral import MixtralAWQForCausalLM from .qwen2 import Qwen2AWQForCausalLM from .gemma import GemmaAWQForCausalLM +from .starcoder2 import Starcoder2AWQForCausalLM \ No newline at end of file diff --git a/awq/models/auto.py b/awq/models/auto.py index cf35a279..a99b7a75 100644 --- a/awq/models/auto.py +++ b/awq/models/auto.py @@ -24,6 +24,7 @@ "llava": LlavaAWQForCausalLM, "qwen2": Qwen2AWQForCausalLM, "gemma": GemmaAWQForCausalLM, + "starcoder2": Starcoder2AWQForCausalLM, } diff --git a/awq/models/base.py b/awq/models/base.py index 12607348..f32576b8 100644 --- a/awq/models/base.py +++ b/awq/models/base.py @@ -68,6 +68,7 @@ "llava": "AutoModelForVision2Seq", "qwen2": "AutoModelForCausalLM", "gemma": "AutoModelForCausalLM", + "starcoder2": 
"AutoModelForCausalLM", } diff --git a/awq/models/starcoder2.py b/awq/models/starcoder2.py new file mode 100644 index 00000000..2e493514 --- /dev/null +++ b/awq/models/starcoder2.py @@ -0,0 +1,141 @@ +import tqdm +from typing import List, Tuple +from .base import BaseAWQForCausalLM +from awq.utils.fused_utils import fuse_qkv +from awq.modules.fused.block import LlamaLikeBlock +from awq.modules.fused.model import LlamaLikeModel +from transformers.models.starcoder2.modeling_starcoder2 import ( + Starcoder2ForCausalLM as OldStarcoder2ForCausalLM, + Starcoder2DecoderLayer as OldStarcoder2DecoderLayer, +) +from awq.modules.fused.norm import FasterTransformerRMSNorm + + +class Starcoder2AWQForCausalLM(BaseAWQForCausalLM): + layer_type = "Starcoder2DecoderLayer" + max_seq_len_key = "max_position_embeddings" + + @staticmethod + def fuse_layers(model: OldStarcoder2ForCausalLM): + fuser = Starcoder2Fuser(model) + fuser.fuse_transformer() + + @staticmethod + def get_model_layers(model: OldStarcoder2ForCausalLM): + return model.model.layers + + @staticmethod + def get_act_for_scaling(module: OldStarcoder2DecoderLayer): + return dict( + is_scalable=True, + scale_name="mlp.act", + scale_layer=module.mlp.act, + scale_shape=module.mlp.c_fc.out_features, + ) + # return dict(is_scalable=False) + + @staticmethod + def move_embed(model: OldStarcoder2ForCausalLM, device): + model.model.embed_tokens = model.model.embed_tokens.to(device) + + @staticmethod + def get_layers_for_scaling(module: OldStarcoder2DecoderLayer, input_feat, module_kwargs): + layers = [] + + # attention input + layers.append( + dict( + prev_op=module.input_layernorm, + layers=[ + module.self_attn.q_proj, + module.self_attn.k_proj, + module.self_attn.v_proj, + ], + inp=input_feat["self_attn.q_proj"], + module2inspect=module.self_attn, + kwargs=module_kwargs, + ) + ) + + # attention out + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: + layers.append( + dict( + prev_op=module.self_attn.v_proj, + layers=[module.self_attn.o_proj], + inp=input_feat["self_attn.o_proj"], + ) + ) + + # linear 1 + layers.append( + dict( + prev_op=module.post_attention_layernorm, + layers=[module.mlp.c_fc], + inp=input_feat["mlp.c_fc"], + module2inspect=module.mlp, + ) + ) + + # linear 2 + layers.append( + dict( + prev_op=module.mlp.act, + layers=[module.mlp.c_proj], + inp=input_feat["mlp.c_proj"], + ) + ) + + return layers + +class Starcoder2Fuser: + def __init__(self, model: OldStarcoder2ForCausalLM): + self.model = model + + self.starcoder2_blocks: List[Tuple[str, OldStarcoder2DecoderLayer]] = [ + (name, module) + for name, module in self.model.named_modules() + if "Starcoder2DecoderLayer".lower() in module.__class__.__name__.lower() + ] + + def fuse_transformer(self): + blocks = [] + + module: OldStarcoder2DecoderLayer + for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."): + device = next(iter(module.state_dict().values())).device + qkv = fuse_qkv( + module, + module.self_attn.q_proj, + module.self_attn.k_proj, + module.self_attn.v_proj, + ) + norm_1 = FasterTransformerRMSNorm( + module.input_layernorm.weight, module.input_layernorm.eps + ) + norm_2 = FasterTransformerRMSNorm( + module.post_attention_layernorm.weight, + module.post_attention_layernorm.eps, + ) + blocks.append( + LlamaLikeBlock( + hidden_size=self.model.config.hidden_size, + n_heads=self.model.config.num_attention_heads, + n_kv_heads=self.model.config.num_key_value_heads, + qkv_layer=qkv, + o_proj=module.self_attn.o_proj, + mlp=module.mlp, + 
norm_1=norm_1, + norm_2=norm_2, + dev=device, + max_seq_len=self.model.config.max_seq_len, + ) + ) + + self.model.model = LlamaLikeModel( + self.model.config.vocab_size, + blocks, + self.model.model.embed_tokens, + self.model.model.norm, + ) + setattr(self.model.model, "blocks", self.model.model.blocks) \ No newline at end of file From e9f62694a867a7a0b2f5e469fcbd914ce5ae0970 Mon Sep 17 00:00:00 2001 From: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Date: Sat, 6 Apr 2024 21:15:28 +0800 Subject: [PATCH 14/18] Add StableLM support (#410) Co-authored-by: Casper --- awq/models/__init__.py | 3 +- awq/models/auto.py | 1 + awq/models/base.py | 1 + awq/models/stablelm.py | 136 +++++++++++++++++++++++++++++++++++++ awq/modules/fused/attn.py | 30 +++++--- awq/modules/fused/block.py | 2 + 6 files changed, 164 insertions(+), 9 deletions(-) create mode 100644 awq/models/stablelm.py diff --git a/awq/models/__init__.py b/awq/models/__init__.py index b2496170..2ae3fd55 100644 --- a/awq/models/__init__.py +++ b/awq/models/__init__.py @@ -15,4 +15,5 @@ from .mixtral import MixtralAWQForCausalLM from .qwen2 import Qwen2AWQForCausalLM from .gemma import GemmaAWQForCausalLM -from .starcoder2 import Starcoder2AWQForCausalLM \ No newline at end of file +from .stablelm import StableLmAWQForCausalLM +from .starcoder2 import Starcoder2AWQForCausalLM diff --git a/awq/models/auto.py b/awq/models/auto.py index a99b7a75..0a236979 100644 --- a/awq/models/auto.py +++ b/awq/models/auto.py @@ -24,6 +24,7 @@ "llava": LlavaAWQForCausalLM, "qwen2": Qwen2AWQForCausalLM, "gemma": GemmaAWQForCausalLM, + "stablelm": StableLmAWQForCausalLM, "starcoder2": Starcoder2AWQForCausalLM, } diff --git a/awq/models/base.py b/awq/models/base.py index f32576b8..ebd45ccc 100644 --- a/awq/models/base.py +++ b/awq/models/base.py @@ -68,6 +68,7 @@ "llava": "AutoModelForVision2Seq", "qwen2": "AutoModelForCausalLM", "gemma": "AutoModelForCausalLM", + "stablelm": "AutoModelForCausalLM", "starcoder2": "AutoModelForCausalLM", } diff --git a/awq/models/stablelm.py b/awq/models/stablelm.py new file mode 100644 index 00000000..b4ad8bb8 --- /dev/null +++ b/awq/models/stablelm.py @@ -0,0 +1,136 @@ +import tqdm +from typing import List, Tuple +from .base import BaseAWQForCausalLM +from awq.utils.fused_utils import fuse_qkv +from awq.modules.fused.block import LlamaLikeBlock +from awq.modules.fused.model import LlamaLikeModel +from transformers.models.stablelm import StableLmForCausalLM as OldStableLmForCausalLM +from transformers.models.stablelm.modeling_stablelm import ( + StableLmDecoderLayer as OldStableLmDecoderLayer, +) +from awq.modules.fused.norm import FasterTransformerRMSNorm + + +class StableLmAWQForCausalLM(BaseAWQForCausalLM): + layer_type = "StableLmDecoderLayer" + max_seq_len_key = "max_position_embeddings" + + @staticmethod + def fuse_layers(model: OldStableLmForCausalLM): + fuser = StableLmFuser(model) + fuser.fuse_transformer() + + @staticmethod + def get_model_layers(model: OldStableLmForCausalLM): + return model.model.layers + + @staticmethod + def get_act_for_scaling(module: OldStableLmForCausalLM): + return dict(is_scalable=False) + + @staticmethod + def move_embed(model: OldStableLmForCausalLM, device: str): + model.model.embed_tokens = model.model.embed_tokens.to(device) + + @staticmethod + def get_layers_for_scaling( + module: OldStableLmDecoderLayer, input_feat, module_kwargs + ): + layers = [] + + # attention input + layers.append( + dict( + prev_op=module.input_layernorm, + layers=[ + module.self_attn.q_proj, + 
module.self_attn.k_proj, + module.self_attn.v_proj, + ], + inp=input_feat["self_attn.q_proj"], + module2inspect=module.self_attn, + kwargs=module_kwargs, + ) + ) + + # attention out + # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696 + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: + layers.append( + dict( + prev_op=module.self_attn.v_proj, + layers=[module.self_attn.o_proj], + inp=input_feat["self_attn.o_proj"], + ) + ) + + # linear 1 + layers.append( + dict( + prev_op=module.post_attention_layernorm, + layers=[module.mlp.gate_proj, module.mlp.up_proj], + inp=input_feat["mlp.gate_proj"], + module2inspect=module.mlp, + ) + ) + + # linear 2 + layers.append( + dict( + prev_op=module.mlp.up_proj, + layers=[module.mlp.down_proj], + inp=input_feat["mlp.down_proj"], + ) + ) + + return layers + + +class StableLmFuser: + def __init__(self, model: OldStableLmForCausalLM): + self.model = model + + self.stablelm_blocks: List[Tuple[str, OldStableLmDecoderLayer]] = [ + (name, module) + for name, module in self.model.named_modules() + if "StableLmDecoderLayer".lower() in module.__class__.__name__.lower() + ] + + def fuse_transformer(self): + blocks = [] + + module: OldStableLmDecoderLayer + for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."): + device = next(iter(module.state_dict().values())).device + qkv = fuse_qkv( + module, + module.self_attn.q_proj, + module.self_attn.k_proj, + module.self_attn.v_proj, + ) + norm_1 = module.input_layernorm + norm_2 = module.post_attention_layernorm + blocks.append( + LlamaLikeBlock( + hidden_size=self.model.config.hidden_size, + n_heads=self.model.config.num_attention_heads, + n_kv_heads=self.model.config.num_key_value_heads, + qkv_layer=qkv, + o_proj=module.self_attn.o_proj, + mlp=module.mlp, + norm_1=norm_1, + norm_2=norm_2, + dev=device, + max_seq_len=self.model.config.max_seq_len, + rope_theta=self.model.config.rope_theta, + partial_rotary_factor=self.model.config.partial_rotary_factor, + ) + ) + + self.model.model = LlamaLikeModel( + self.model.config.vocab_size, + blocks, + self.model.model.embed_tokens, + self.model.model.norm, + ) + setattr(self.model.model, "blocks", self.model.model.blocks) diff --git a/awq/modules/fused/attn.py b/awq/modules/fused/attn.py index 9775126b..e334dd7f 100644 --- a/awq/modules/fused/attn.py +++ b/awq/modules/fused/attn.py @@ -29,9 +29,7 @@ def __init__(self, head_dim, max_seq_len, device, rope_theta): super(RoPE, self).__init__() self.freqs_cis = nn.Parameter( - self.precompute_freqs_cis( - head_dim, max_seq_len * 2, rope_theta - ).to(device), + self.precompute_freqs_cis(head_dim, max_seq_len * 2, rope_theta).to(device), requires_grad=False, ) @@ -118,6 +116,7 @@ def __init__( use_alibi=False, attention_shapes=None, rope_theta=10000, + partial_rotary_factor=1.0, head_dim=None, **kwargs ): @@ -127,7 +126,7 @@ def __init__( self.n_kv_heads = n_kv_heads self.n_kv_groups = n_heads // n_kv_heads if n_kv_heads != 0 else 0 self.head_dim = head_dim - + if head_dim is None: self.head_dim = hidden_size // n_heads @@ -167,8 +166,9 @@ def __init__( self.is_neox = False else: self.alibi = None - self.rope = RoPE(self.head_dim, max_seq_len, dev, rope_theta) - self.rotary_dim = self.head_dim + self.partial_rotary_factor = partial_rotary_factor + self.rotary_dim = int(self.head_dim * self.partial_rotary_factor) + self.rope = RoPE(self.rotary_dim, max_seq_len, dev, rope_theta) self.is_neox = True def forward( @@ -209,13 +209,27 @@ def forward( xk = 
self.attention_shapes["xk_slice"](xqkv) xv = self.attention_shapes["xv_slice"](xqkv) - if seqlen > 1 or not FT_INSTALLED: + if seqlen > 1 or self.partial_rotary_factor < 1 or not FT_INSTALLED: xq = xq.view((bsz, seqlen) + self.attention_shapes["xq_view"]) xk = xk.view((bsz, seqlen) + self.attention_shapes["xk_view"]) xv = xv.view((bsz, seqlen) + self.attention_shapes["xv_view"]) if not self.use_alibi: - xq, xk = self.rope.forward(xq, xk, self.start_pos, seqlen) + # Partial rotary embedding + if self.partial_rotary_factor < 1: + xq_rot, xq_pass = ( + xq[..., : self.rotary_dim], + xq[..., self.rotary_dim :], + ) + xk_rot, xk_pass = ( + xk[..., : self.rotary_dim], + xk[..., self.rotary_dim :], + ) + xq_rot, xk_rot = self.rope.forward(xq_rot, xk_rot, self.start_pos, seqlen) + xq = torch.cat((xq_rot, xq_pass), dim=-1) + xk = torch.cat((xk_rot, xk_pass), dim=-1) + else: + xq, xk = self.rope.forward(xq, xk, self.start_pos, seqlen) values_store = xv.transpose(2, 1) keys_store = ( diff --git a/awq/modules/fused/block.py b/awq/modules/fused/block.py index 23cd954d..e1971e37 100644 --- a/awq/modules/fused/block.py +++ b/awq/modules/fused/block.py @@ -79,6 +79,7 @@ def __init__( dev, max_seq_len, rope_theta=10000, + partial_rotary_factor=1.0, use_alibi=False, head_dim=None, ): @@ -103,6 +104,7 @@ def __init__( max_seq_len=max_seq_len, use_alibi=use_alibi, rope_theta=rope_theta, + partial_rotary_factor=partial_rotary_factor, head_dim=head_dim, ).to(dev) self.norm_2 = norm_2.to(dev) From 0e52a5c236f897db96848c9d9f6071092698813c Mon Sep 17 00:00:00 2001 From: TechxGenus Date: Sat, 13 Apr 2024 00:46:46 +0800 Subject: [PATCH 15/18] Fix starcoder2 fused norm (#442) --- awq/models/starcoder2.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/awq/models/starcoder2.py b/awq/models/starcoder2.py index 2e493514..be79506b 100644 --- a/awq/models/starcoder2.py +++ b/awq/models/starcoder2.py @@ -110,13 +110,9 @@ def fuse_transformer(self): module.self_attn.k_proj, module.self_attn.v_proj, ) - norm_1 = FasterTransformerRMSNorm( - module.input_layernorm.weight, module.input_layernorm.eps - ) - norm_2 = FasterTransformerRMSNorm( - module.post_attention_layernorm.weight, - module.post_attention_layernorm.eps, - ) + # SC2 use normal LayerNorm + norm_1 = module.input_layernorm + norm_2 = module.post_attention_layernorm blocks.append( LlamaLikeBlock( hidden_size=self.model.config.hidden_size, From edcf780fceef5bacff0c8795d92d927df22e0717 Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 18 Apr 2024 19:09:05 +0200 Subject: [PATCH 16/18] Update generate example to llama 3 (#448) --- examples/generate.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/examples/generate.py b/examples/generate.py index 043e3487..c14f9086 100644 --- a/examples/generate.py +++ b/examples/generate.py @@ -2,28 +2,36 @@ from transformers import AutoTokenizer, TextStreamer -quant_path = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ" +quant_path = "casperhansen/llama-3-8b-instruct-awq" # Load model model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True) tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True) streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) -# Convert prompt to tokens -prompt_template = "[INST] {prompt} [/INST]" - prompt = "You're standing on the surface of the Earth. "\ "You walk one mile south, one mile west and one mile north. "\ "You end up exactly where you started. Where are you?" 
-tokens = tokenizer(
-    prompt_template.format(prompt=prompt),
-    return_tensors='pt'
-).input_ids.cuda()
+chat = [
+    {"role": "system", "content": "You are a concise assistant that helps answer questions."},
+    {"role": "user", "content": prompt},
+]
+
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
+
+tokens = tokenizer.apply_chat_template(
+    chat,
+    return_tensors="pt"
+).cuda()
 
 # Generate output
 generation_output = model.generate(
     tokens,
     streamer=streamer,
-    max_new_tokens=512
-)
\ No newline at end of file
+    max_new_tokens=64,
+    eos_token_id=terminators
+)

From 24b98c251b87c21b04bfc7e28dc803392da6ce21 Mon Sep 17 00:00:00 2001
From: Shaun Prince
Date: Thu, 18 Apr 2024 12:39:42 -0700
Subject: [PATCH 17/18] [BUG] Fix github action documentation build (#449)

---
 .github/workflows/docs.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 1cec7346..076d238f 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -16,7 +16,7 @@ jobs:
           git config user.email 41898282+github-actions[bot]@users.noreply.github.com
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.x
+          python-version: 3.11
       - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
       - uses: actions/cache@v3
         with:
           key: mkdocs-material-${{ env.cache_id }}
           path: .cache
           restore-keys: |
             mkdocs-material-docs
       - run: pip install mkdocstrings-python mkdocs-material griffe-typingdoc
-      - run: mkdocs gh-deploy --force
\ No newline at end of file
+      - run: mkdocs gh-deploy --force

From 4fc6cc03a34f8b58b8ddc8cc04fafe39c775d991 Mon Sep 17 00:00:00 2001
From: Casper
Date: Fri, 19 Apr 2024 10:42:47 +0200
Subject: [PATCH 18/18] Fix path (#451)

---
 mkdocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkdocs.yml b/mkdocs.yml
index 243ab9ea..9c25c4fe 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -32,7 +32,7 @@ plugins:
   mkdocstrings:
     handlers:
       python:
-        paths: [awq]
+        paths: [.]
         options:
           extensions:
           - griffe_typingdoc
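
Usage sketch (not part of the patch series above): the snippet below shows how the two user-facing arguments introduced in this series, `apply_clip` from PATCH 10/18 and `download_kwargs` from PATCH 12/18, would be passed in practice. The model paths are placeholders, and the `quant_config` values are typical AutoAWQ settings assumed here rather than taken from the patches.

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "org/fp16-model"   # placeholder model id
quant_path = "fp16-model-awq"   # placeholder output directory

# Typical AutoAWQ quantization config (assumed values, not defined in these patches)
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# download_kwargs (PATCH 12/18) is forwarded to snapshot_download;
# extra ignore_patterns are merged with the loader's defaults.
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    download_kwargs={"ignore_patterns": ["*.bin"]},
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# apply_clip=False (PATCH 10/18) skips the clipping search entirely;
# per the new docstring, some models may perform better with clipping disabled.
model.quantize(tokenizer, quant_config=quant_config, apply_clip=False)
model.save_quantized(quant_path)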