This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit d8b455f: Cleanup, format, minor fixes
ElizaWszola committed Jul 12, 2024 (1 parent: c469b74)
Showing 7 changed files with 187 additions and 941 deletions.
3 changes: 0 additions & 3 deletions csrc/moe/marlin_moe_ops.cu
@@ -23,10 +23,7 @@
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 
-// #include "marlin_moe_ops.h"
-
 #include <iostream>
-// #include <torch/extension.h>
 
 template <typename T>
 inline std::string str(T x) {
5 changes: 3 additions & 2 deletions vllm/_custom_ops.py
@@ -260,9 +260,10 @@ def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
     return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n,
                                             num_bits)
 
+
 def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
-                       size_k: int, size_n: int,
-                       num_bits: int) -> torch.Tensor:
+                           size_k: int, size_n: int,
+                           num_bits: int) -> torch.Tensor:
     num_experts = b_q_weight.shape[0]
     output = torch.empty((num_experts, size_k // 16, size_n * 2),
                          device=b_q_weight.device,
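The hunk cuts off just below the torch.empty call. For context, here is a minimal sketch of how the per-expert repack plausibly completes, assuming the dense torch.ops._C.gptq_marlin_repack op seen above is applied expert by expert; the loop body is an illustration, not the verbatim file contents:

```python
import torch

def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                           size_k: int, size_n: int,
                           num_bits: int) -> torch.Tensor:
    num_experts = b_q_weight.shape[0]
    # One packed weight tensor per expert, in Marlin's repacked layout.
    output = torch.empty((num_experts, size_k // 16, size_n * 2),
                         device=b_q_weight.device,
                         dtype=b_q_weight.dtype)
    for e in range(num_experts):
        # Reuse the dense repack kernel on each expert's weight slice
        # (assumed here; the actual loop body is truncated in the diff).
        output[e] = torch.ops._C.gptq_marlin_repack(b_q_weight[e], perm[e],
                                                    size_k, size_n, num_bits)
    return output
```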
11 changes: 0 additions & 11 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -737,17 +737,6 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
     E = w1.shape[0]
     N = w2.shape[1] * 16
 
-    # print("hidden_states shape:", hidden_states)
-    # print("w1 shape:", w1)
-    # print("w2 shape:", w2)
-    # print("gating_output shape:", gating_output)
-    # print("g_idx1 shape:", g_idx1)
-    # print("g_idx2 shape:", g_idx2)
-    # print("w1_scale shape:", w1_scale)
-    # print("w2_scale shape:", w2_scale)
-
-    # raise ValueError("stop")
-
     topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
                                         renormalize)

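The surviving call to fused_topk performs the MoE routing. As a reference for what it computes, here is a pure-PyTorch sketch of the standard softmax top-k routing it is assumed to implement (the real function dispatches to fused kernels; fused_topk_reference is a name invented for this illustration):

```python
import torch

def fused_topk_reference(hidden_states: torch.Tensor,
                         gating_output: torch.Tensor,
                         topk: int,
                         renormalize: bool):
    # hidden_states is only needed for shape consistency with the gating
    # output; the routing itself depends on the gating logits alone.
    assert hidden_states.shape[0] == gating_output.shape[0]
    # Softmax over expert logits, then pick the top-k experts per token.
    scores = torch.softmax(gating_output, dim=-1, dtype=torch.float32)
    topk_weights, topk_ids = torch.topk(scores, topk, dim=-1)
    if renormalize:
        # Rescale the selected weights so they sum to 1 for each token.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids
```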
194 changes: 85 additions & 109 deletions vllm/model_executor/layers/fused_moe/layer.py

Large diffs are not rendered by default.

45 changes: 0 additions & 45 deletions vllm/model_executor/layers/linear.py
@@ -222,51 +222,6 @@ def extra_repr(self) -> str:
s += f", output_features={self.output_size}"
s += f", bias={self.bias is not None}"
return s

class FusedLinearMarlin(LinearBase):

"""
Args:
input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters.
quant_config: Quantization configure.
"""

def __init__(self,
input_size13: int,
input_size2: int,
quant_config: Optional[QuantizationConfig] = None):
# calling with inputsize13 is a bit of an ugly workaround,
# it's not used for anything
super().__init__(input_size13, input_size13, False, None,
quant_config)
self.input_size13 = input_size13
self.input_size2 = input_size2
self.output_size13 = input_size2
self.output_size2 = input_size13

# All the linear layer supports quant method.
assert self.quant_method is not None
self.quant_method.create_weights(self, self.input_size13, self.input_size2,
self.output_size13, self.output_size2,
self.input_size13, self.input_size2,
self.params_dtype)

self.register_parameter("bias", None)

def forward(self, x: torch.Tensor) -> torch.Tensor:
assert self.quant_method is not None
output = self.quant_method.apply(self, x, None)
return output, None

def extra_repr(self) -> str:
s = f"in_features={self.input_size}"
s += f", output_features={self.output_size}"
s += f", bias={self.bias is not None}"
return s


class ColumnParallelLinear(LinearBase):
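For readers unfamiliar with why the removed class could delegate everything to self.quant_method: LinearBase subclasses in vLLM hand weight creation and the forward matmul to a quant-method object. A minimal sketch of that contract, with a simplified, illustrative ToyQuantMethod standing in for the real quant-method implementations (whose create_weights signatures carry more arguments, as the removed call above shows):

```python
from typing import Optional

import torch
import torch.nn as nn


class ToyQuantMethod:
    """Illustrative stand-in for a vLLM quant method (simplified API)."""

    def create_weights(self, layer: nn.Module, input_size: int,
                       output_size: int,
                       params_dtype: torch.dtype) -> None:
        # Register the layer's parameters at construction time; a real
        # quant method would allocate packed/quantized buffers instead.
        weight = nn.Parameter(
            torch.empty(output_size, input_size, dtype=params_dtype))
        layer.register_parameter("weight", weight)

    def apply(self, layer: nn.Module, x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Run the (possibly quantized) matmul in the forward pass, which
        # is all FusedLinearMarlin.forward did via quant_method.apply.
        return nn.functional.linear(x, layer.weight, bias)
```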

2 comments on commit d8b455f

@github-actions

smaller_is_better

All rows share the same setup: VLLM Serving - Dense, benchmark_serving with "nr-qps-pair_": "300,1" and "dataset": "sharegpt", sparsity None, GPU: NVIDIA H100 80GB HBM3 x 1, vllm_version 0.5.1, python_version 3.10.12, torch_version 2.3.0+cu121. This run reports current values only (no baseline from 9daca33).

| Metric | Model (max-model-len) | Current: d8b455f | Previous: 9daca33 | Ratio |
| --- | --- | --- | --- | --- |
| mean_ttft_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 29.921924228159092 ms | n/a | n/a |
| mean_tpot_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 10.983376331405942 ms | n/a | n/a |
| mean_ttft_ms | facebook/opt-350m (2048) | 41.91395126127948 ms | n/a | n/a |
| mean_tpot_ms | facebook/opt-350m (2048) | 7.755191501705505 ms | n/a | n/a |

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

smaller_is_better

Same setup as the previous comment, but on GPU: NVIDIA L4 x 1.

| Metric | Model (max-model-len) | Current: d8b455f | Previous: 9daca33 | Ratio |
| --- | --- | --- | --- | --- |
| mean_ttft_ms | facebook/opt-350m (2048) | 23.72992594333217 ms | 24.225503546652668 ms | 0.98 |
| mean_tpot_ms | facebook/opt-350m (2048) | 6.105954250486794 ms | 6.191982505897308 ms | 0.99 |
| mean_ttft_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 187.05185744999275 ms | 182.77201948004708 ms | 1.02 |
| mean_tpot_ms | meta-llama/Meta-Llama-3-8B-Instruct (4096) | 85.11478736983392 ms | 84.39948871030083 ms | 1.01 |

This comment was automatically generated by workflow using github-action-benchmark.
