
Commit

format
ElizaWszola committed Jul 4, 2024
1 parent a0d7f77 commit f629593
Showing 3 changed files with 5 additions and 1 deletion.
2 changes: 2 additions & 0 deletions tests/kernels/test_moe.py
@@ -38,6 +38,7 @@ def torch_moe(a, w1, w2, score, topk):
return (out.view(B, -1, w2.shape[1]) *
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)


def torch_moe_single(a, w, score, topk):
B, D = a.shape
a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
@@ -51,6 +52,7 @@ def torch_moe_single(a, w, score, topk):
out[mask] = a[mask] @ w[i].transpose(0, 1)
return (out.view(B, -1, w.shape[1])).sum(dim=1)


@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
@pytest.mark.parametrize("n", [2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 511, 1024])
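Only fragments of the torch_moe_single reference helper are visible in the hunks above. For context, here is a self-contained sketch of the naive top-k MoE computation it checks against: route each token to its top-k experts, apply each expert's weight matrix, and sum the k outputs (consistent with the unweighted sum in the visible return line). The softmax over the router scores and the demo shapes are assumptions, since most of the function body is collapsed in the diff.

```python
import torch


def naive_moe_single(a: torch.Tensor, w: torch.Tensor, score: torch.Tensor,
                     topk: int) -> torch.Tensor:
    """Naive MoE reference: top-k routing, per-expert matmul, unweighted sum."""
    B, D = a.shape
    # Duplicate each token once per selected expert: (B * topk, D).
    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
    out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
    # Pick the top-k experts for each token from the (B, num_experts) scores.
    score = torch.softmax(score, dim=-1)
    _, topk_ids = torch.topk(score, topk)
    topk_ids = topk_ids.view(-1)
    # Apply each expert's (N, D) weight to the tokens routed to it.
    for i in range(w.shape[0]):
        mask = topk_ids == i
        if mask.any():
            out[mask] = a[mask] @ w[i].transpose(0, 1)
    # Sum the k expert outputs per token: (B, N).
    return out.view(B, -1, w.shape[1]).sum(dim=1)


if __name__ == "__main__":
    B, D, N, E, topk = 4, 8, 16, 6, 2
    a, w, score = torch.randn(B, D), torch.randn(E, N, D), torch.randn(B, E)
    print(naive_moe_single(a, w, score, topk).shape)  # torch.Size([4, 16])
```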
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,6 +1,6 @@
  from vllm.model_executor.layers.fused_moe.fused_moe import (
      fused_experts, fused_marlin_moe, fused_moe, fused_topk,
-     get_config_file_name, single_marlin_moe, grouped_topk)
+     get_config_file_name, grouped_topk, single_marlin_moe)

__all__ = [
"fused_moe",
2 changes: 2 additions & 0 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -367,6 +367,7 @@ def fused_topk(
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
return topk_weights, topk_ids


# This is used by the Deepseek-V2 model
def grouped_topk(
hidden_states: torch.Tensor,
@@ -412,6 +413,7 @@ def get_expert_config(w1: torch.Tensor, w2: torch.Tensor, topk: int, M: int,
return get_default_config(M, E, N, w1.shape[2], topk,
"float8" if use_fp8 else None)


def fused_experts(hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
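The second hunk in this file adds spacing around grouped_topk, which the inline comment ties to the DeepSeek-V2 model. As background, one common formulation of grouped top-k routing is sketched below; the function name, its parameters, and the max-per-group scoring are illustrative assumptions, not the actual implementation in fused_moe.py.

```python
import torch


def grouped_topk_sketch(gating_output: torch.Tensor, topk: int,
                        num_expert_group: int, topk_group: int):
    """Pick top-k experts per token, restricted to the best expert groups."""
    scores = torch.softmax(gating_output, dim=-1)        # (tokens, experts)
    num_tokens = scores.shape[0]
    # Score each group by its strongest expert: (tokens, num_expert_group).
    group_scores = scores.view(num_tokens, num_expert_group,
                               -1).max(dim=-1).values
    # Keep only the best `topk_group` groups per token; mask out the rest.
    group_idx = torch.topk(group_scores, topk_group, dim=-1).indices
    group_mask = torch.zeros_like(group_scores)
    group_mask.scatter_(1, group_idx, 1.0)
    experts_per_group = scores.shape[1] // num_expert_group
    score_mask = group_mask.unsqueeze(-1).expand(
        num_tokens, num_expert_group, experts_per_group).reshape(num_tokens, -1)
    masked_scores = scores.masked_fill(score_mask == 0, 0.0)
    # Ordinary top-k over the surviving experts, renormalized to sum to 1.
    topk_weights, topk_ids = torch.topk(masked_scores, topk, dim=-1)
    topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids


# Example: 4 tokens, 8 experts in 4 groups of 2; route each token to
# 2 experts drawn from its 2 highest-scoring groups.
weights, ids = grouped_topk_sketch(torch.randn(4, 8), topk=2,
                                   num_expert_group=4, topk_group=2)
print(weights.shape, ids.shape)  # torch.Size([4, 2]) torch.Size([4, 2])
```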

1 comment on commit f629593

@github-actions

smaller_is_better

All rows: VLLM Serving - Dense, benchmark_serving with nr-qps-pair_ "300,1" on the sharegpt dataset, sparsity None; GPU: NVIDIA L4 x 1; vllm_version 0.5.1; python_version 3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]; torch_version 2.3.0+cu121.

| Benchmark | Model (max-model-len) | Current: f629593 | Previous: f6f2554 | Ratio |
| --- | --- | --- | --- | --- |
| mean_ttft_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 183.48948831311037 ms | 187.90222024999898 ms | 0.98 |
| mean_tpot_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 84.226856744941 ms | 83.68341362953504 ms | 1.01 |
| mean_ttft_ms | facebook/opt-350m (2048) | 25.00361027327017 ms | 24.958321440002084 ms | 1.00 |
| mean_tpot_ms | facebook/opt-350m (2048) | 6.221973479174864 ms | 6.288577137086927 ms | 0.99 |

This comment was automatically generated by a workflow using github-action-benchmark.
