This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit d8b455f: Cleanup, format, minor fixes
ElizaWszola committed Jul 12, 2024 (1 parent: c469b74)
Showing 7 changed files with 187 additions and 941 deletions.
3 changes: 0 additions & 3 deletions csrc/moe/marlin_moe_ops.cu
@@ -23,10 +23,7 @@
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 
-// #include "marlin_moe_ops.h"
-
 #include <iostream>
-// #include <torch/extension.h>
 
 template <typename T>
 inline std::string str(T x) {
5 changes: 3 additions & 2 deletions vllm/_custom_ops.py
@@ -260,9 +260,10 @@ def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
     return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n,
                                             num_bits)
 
+
 def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
-                       size_k: int, size_n: int,
-                       num_bits: int) -> torch.Tensor:
+                           size_k: int, size_n: int,
+                           num_bits: int) -> torch.Tensor:
     num_experts = b_q_weight.shape[0]
     output = torch.empty((num_experts, size_k // 16, size_n * 2),
                          device=b_q_weight.device,
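The hunk cuts off just below the torch.empty call. For context, here is a minimal sketch of how the per-expert repack plausibly completes, assuming the dense torch.ops._C.gptq_marlin_repack op seen above is applied expert by expert; the loop body is an illustration, not the verbatim file contents:

```python
import torch

def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                           size_k: int, size_n: int,
                           num_bits: int) -> torch.Tensor:
    num_experts = b_q_weight.shape[0]
    # One packed weight tensor per expert, in Marlin's repacked layout.
    output = torch.empty((num_experts, size_k // 16, size_n * 2),
                         device=b_q_weight.device,
                         dtype=b_q_weight.dtype)
    for e in range(num_experts):
        # Reuse the dense repack kernel on each expert's weight slice
        # (assumed here; the actual loop body is truncated in the diff).
        output[e] = torch.ops._C.gptq_marlin_repack(b_q_weight[e], perm[e],
                                                    size_k, size_n, num_bits)
    return output
```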
11 changes: 0 additions & 11 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -737,17 +737,6 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
     E = w1.shape[0]
     N = w2.shape[1] * 16
 
-    # print("hidden_states shape:", hidden_states)
-    # print("w1 shape:", w1)
-    # print("w2 shape:", w2)
-    # print("gating_output shape:", gating_output)
-    # print("g_idx1 shape:", g_idx1)
-    # print("g_idx2 shape:", g_idx2)
-    # print("w1_scale shape:", w1_scale)
-    # print("w2_scale shape:", w2_scale)
-
-    # raise ValueError("stop")
-
     topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
                                         renormalize)

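The surviving call to fused_topk performs the MoE routing. As a reference for what it computes, here is a pure-PyTorch sketch of the standard softmax top-k routing it is assumed to implement (the real function dispatches to fused kernels; fused_topk_reference is a name invented for this illustration):

```python
import torch

def fused_topk_reference(hidden_states: torch.Tensor,
                         gating_output: torch.Tensor,
                         topk: int,
                         renormalize: bool):
    # hidden_states is only needed for shape consistency with the gating
    # output; the routing itself depends on the gating logits alone.
    assert hidden_states.shape[0] == gating_output.shape[0]
    # Softmax over expert logits, then pick the top-k experts per token.
    scores = torch.softmax(gating_output, dim=-1, dtype=torch.float32)
    topk_weights, topk_ids = torch.topk(scores, topk, dim=-1)
    if renormalize:
        # Rescale the selected weights so they sum to 1 for each token.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids
```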
194 changes: 85 additions & 109 deletions vllm/model_executor/layers/fused_moe/layer.py

Large diffs are not rendered by default.

45 changes: 0 additions & 45 deletions vllm/model_executor/layers/linear.py
@@ -222,51 +222,6 @@ def extra_repr(self) -> str:
s += f", output_features={self.output_size}"
s += f", bias={self.bias is not None}"
return s

class FusedLinearMarlin(LinearBase):

"""
Args:
input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters.
quant_config: Quantization configure.
"""

def __init__(self,
input_size13: int,
input_size2: int,
quant_config: Optional[QuantizationConfig] = None):
# calling with inputsize13 is a bit of an ugly workaround,
# it's not used for anything
super().__init__(input_size13, input_size13, False, None,
quant_config)
self.input_size13 = input_size13
self.input_size2 = input_size2
self.output_size13 = input_size2
self.output_size2 = input_size13

# All the linear layer supports quant method.
assert self.quant_method is not None
self.quant_method.create_weights(self, self.input_size13, self.input_size2,
self.output_size13, self.output_size2,
self.input_size13, self.input_size2,
self.params_dtype)

self.register_parameter("bias", None)

def forward(self, x: torch.Tensor) -> torch.Tensor:
assert self.quant_method is not None
output = self.quant_method.apply(self, x, None)
return output, None

def extra_repr(self) -> str:
s = f"in_features={self.input_size}"
s += f", output_features={self.output_size}"
s += f", bias={self.bias is not None}"
return s


class ColumnParallelLinear(LinearBase):
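For readers unfamiliar with why the removed class could delegate everything to self.quant_method: LinearBase subclasses in vLLM hand weight creation and the forward matmul to a quant-method object. A minimal sketch of that contract, with a simplified, illustrative ToyQuantMethod standing in for the real quant-method implementations (whose create_weights signatures carry more arguments, as the removed call above shows):

```python
from typing import Optional

import torch
import torch.nn as nn


class ToyQuantMethod:
    """Illustrative stand-in for a vLLM quant method (simplified API)."""

    def create_weights(self, layer: nn.Module, input_size: int,
                       output_size: int,
                       params_dtype: torch.dtype) -> None:
        # Register the layer's parameters at construction time; a real
        # quant method would allocate packed/quantized buffers instead.
        weight = nn.Parameter(
            torch.empty(output_size, input_size, dtype=params_dtype))
        layer.register_parameter("weight", weight)

    def apply(self, layer: nn.Module, x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Run the (possibly quantized) matmul in the forward pass, which
        # is all FusedLinearMarlin.forward did via quant_method.apply.
        return nn.functional.linear(x, layer.weight, bias)
```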

2 comments on commit d8b455f

@github-actions

smaller_is_better

All rows share the same setup: VLLM Serving - Dense, benchmark_serving with "nr-qps-pair_": "300,1" and "dataset": "sharegpt", sparsity None, GPU: NVIDIA H100 80GB HBM3 x 1, vllm_version 0.5.1, python_version 3.10.12, torch_version 2.3.0+cu121. This run reports current values only (no baseline from 9daca33).

| Metric | Model (max-model-len) | Current: d8b455f | Previous: 9daca33 | Ratio |
| --- | --- | --- | --- | --- |
| mean_ttft_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 29.921924228159092 ms | n/a | n/a |
| mean_tpot_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 10.983376331405942 ms | n/a | n/a |
| mean_ttft_ms | facebook/opt-350m (2048) | 41.91395126127948 ms | n/a | n/a |
| mean_tpot_ms | facebook/opt-350m (2048) | 7.755191501705505 ms | n/a | n/a |

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

smaller_is_better

Same setup as the previous comment, but on GPU: NVIDIA L4 x 1.

| Metric | Model (max-model-len) | Current: d8b455f | Previous: 9daca33 | Ratio |
| --- | --- | --- | --- | --- |
| mean_ttft_ms | facebook/opt-350m (2048) | 23.72992594333217 ms | 24.225503546652668 ms | 0.98 |
| mean_tpot_ms | facebook/opt-350m (2048) | 6.105954250486794 ms | 6.191982505897308 ms | 0.99 |
| mean_ttft_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 187.05185744999275 ms | 182.77201948004708 ms | 1.02 |
| mean_tpot_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 85.11478736983392 ms | 84.39948871030083 ms | 1.01 |

This comment was automatically generated by workflow using github-action-benchmark.
