casper-hansen · MeJerry215 · Jan 4, 2024
diff --git a/awq/models/__init__.py b/awq/models/__init__.py
@@ -13,3 +13,4 @@
 from .baichuan import BaichuanAWQForCausalLM
 from .llava import LlavaAWQForCausalLM
 from .mixtral import MixtralAWQForCausalLM
+from .codeshell import CodeShellAWQForCausalLM
diff --git a/awq/models/auto.py b/awq/models/auto.py
@@ -21,6 +21,7 @@
     "qwen": QwenAWQForCausalLM,
     "baichuan": BaichuanAWQForCausalLM,
     "llava": LlavaAWQForCausalLM,
+    "codeshell": CodeShellAWQForCausalLM,
 }
 
 def check_and_get_model_type(model_dir, trust_remote_code=True, **model_init_kwargs):

diff --git a/awq/models/base.py b/awq/models/base.py
@@ -57,6 +57,7 @@
     "qwen": "AutoModelForCausalLM",
     "baichuan": "AutoModelForCausalLM",
     "llava": "AutoModelForVision2Seq",
+    "codeshell": "AutoModelForCausalLM",
 }
 
 class BaseAWQForCausalLM(nn.Module):

diff --git a/awq/models/codeshell.py b/awq/models/codeshell.py
@@ -0,0 +1,51 @@
+from .base import BaseAWQForCausalLM
+
+class CodeShellAWQForCausalLM(BaseAWQForCausalLM):
+    """AWQ adapter describing where CodeShell keeps its layers and scalable ops."""
+
+    layer_type = "CodeShellBlock"
+    max_new_tokens_key = "n_positions"
+
+    @staticmethod
+    def get_model_layers(model):
+        """Return ``model.transformer.h``, the container of the model's blocks."""
+        return model.transformer.h
+
+    @staticmethod
+    def get_act_for_scaling(module):
+        """Describe ``module.mlp.act`` as scalable, sized by ``c_fc.out_features``."""
+        mlp = module.mlp
+        return {
+            "is_scalable": True,
+            "scale_name": "mlp.act",
+            "scale_layer": mlp.act,
+            "scale_shape": mlp.c_fc.out_features,
+        }
+
+    @staticmethod
+    def move_embed(model, device: str):
+        """Move ``model.transformer.wte`` to ``device`` and rebind it on the model."""
+        transformer = model.transformer
+        transformer.wte = transformer.wte.to(device)
+
+    @staticmethod
+    def get_layers_for_scaling(module, input_feat, module_kwargs):
+        """Return one ``prev_op``/``layers``/``inp`` dict per scaled linear layer."""
+        # ``module_kwargs`` is accepted but not read by this implementation.
+        return [
+            {  # attention
+                "prev_op": module.ln_1,
+                "layers": [module.attn.c_attn],
+                "inp": input_feat["attn.c_attn"],
+            },
+            {  # mlp
+                "prev_op": module.ln_2,
+                "layers": [module.mlp.c_fc],
+                "inp": input_feat["mlp.c_fc"],
+            },
+            {
+                "prev_op": module.mlp.act,
+                "layers": [module.mlp.c_proj],
+                "inp": input_feat["mlp.c_proj"],
+            },
+        ]