From 208b2a02a43cebfcbe6a31410912689cfd3b13e3 Mon Sep 17 00:00:00 2001 From: Faraz Shahsavan Date: Wed, 11 Dec 2024 21:08:30 +0000 Subject: [PATCH] Format vllm code --- .../cutlass_benchmarks/sp_fp8_benchmarks.py | 25 +++++++++---------- csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu | 18 ++++++------- csrc/sparse/cutlass/sparse_scaled_mm_entry.cu | 8 +++--- tests/kernels/test_cutlass.py | 20 ++++++++------- vllm/_custom_ops.py | 7 +++--- 5 files changed, 38 insertions(+), 40 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/sp_fp8_benchmarks.py b/benchmarks/cutlass_benchmarks/sp_fp8_benchmarks.py index 0dd59c708d9cd..fb7c1d8fcd82d 100644 --- a/benchmarks/cutlass_benchmarks/sp_fp8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sp_fp8_benchmarks.py @@ -1,11 +1,11 @@ import argparse import copy -import itertools -import pickle as pkl -import time import dataclasses +import itertools import multiprocessing as mp import os +import pickle as pkl +import time import traceback from multiprocessing import Process, Queue from pathlib import Path @@ -15,11 +15,11 @@ import torch import torch.utils.benchmark as TBenchmark from torch.utils.benchmark import Measurement as TMeasurement -from weight_shapes import WEIGHT_SHAPES -from vllm.utils import FlexibleArgumentParser -import vllm._custom_ops as ops from utils import make_n_rand_sparse_tensors +from weight_shapes import WEIGHT_SHAPES +import vllm._custom_ops as ops +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -490,8 +490,8 @@ def run_kernels_on_gpus( bench = BenchMM(cuda_graph_params, label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm", ops.cutlass_scaled_sparse_mm, ArgPool(As), - ArgPool(BComps), ArgPool(Es), - scale_a, scale_b, torch.bfloat16) + ArgPool(BComps), ArgPool(Es), scale_a, + scale_b, torch.bfloat16) # Run the benchmark result = bench.run() @@ -575,8 +575,8 @@ def bench_fp8(dtype: torch.dtype, with_cuda_graph: Optional[int], def bench(dtype: torch.dtype, with_cuda_graph: Optional[int], - with_arg_pool: Optional[int], m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: + with_arg_pool: Optional[int], m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: if dtype == torch.float8_e4m3fn: return bench_fp8(dtype, with_cuda_graph, with_arg_pool, m, k, n, label, sub_label) @@ -599,9 +599,8 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: if args.with_cuda_graph else label label = f"{label}-argpool_{args.with_arg_pool}" \ if args.with_arg_pool else label - timers = bench(args.dtype, args.with_cuda_graph, - args.with_arg_pool, m, k, n, label, - f"MKN=({m}x{k}x{n})") + timers = bench(args.dtype, args.with_cuda_graph, args.with_arg_pool, m, + k, n, label, f"MKN=({m}x{k}x{n})") print_timers(timers) results.extend(timers) diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu index a4f5eaed4134f..7ee5246a52d79 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -297,8 +297,7 @@ void cutlass_scaled_sparse_mm_sm90_epilogue(torch::Tensor& out, } } -void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, - torch::Tensor const& a, +void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& e, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -317,15 +316,12 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, } } -void cutlass_scaled_sparse_mm_azp_sm90(torch::Tensor& out, - torch::Tensor const& a, - torch::Tensor const& e, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { +void cutlass_scaled_sparse_mm_azp_sm90( + torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& e, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& azp_adj, + c10::optional const& azp, + c10::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu index 8a92d3a598964..9e23df5a05d69 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu @@ -51,10 +51,10 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1 && c.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(b.stride(1) % 16 == 0); // 16 Byte Alignment + TORCH_CHECK(a.stride(1) == 1); // Row-major + TORCH_CHECK(b.stride(0) == 1 && c.stride(0) == 1); // Column-major + TORCH_CHECK(c.stride(1) % 16 == 0); // 16 Byte Alignment + TORCH_CHECK(b.stride(1) % 16 == 0); // 16 Byte Alignment TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); if (bias) { diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 2c5d19cc54c54..aa7f517ce6ff0 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -2,7 +2,7 @@ Run `pytest tests/kernels/test_cutlass.py`. """ -from typing import Optional, Type, Tuple +from typing import Optional, Tuple, Type import pytest import torch @@ -86,8 +86,9 @@ def prune_to_2_4(tensor): return pruned.reshape(original_shape) -def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> Tuple[torch.Tensor, torch.Tensor]: +def make_rand_sparse_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5 b = torch.randn((n, k), device='cuda').t() * 5 @@ -464,7 +465,8 @@ def test_cutlass_sparse_subset(): m, n, k = 512, 512, 512 # Create tensors - b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, big_m, n, k) + b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, + big_m, n, k) a = whole_a[0:m, 0:k] scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 @@ -472,11 +474,11 @@ def test_cutlass_sparse_subset(): print("in test") out = ops.cutlass_scaled_sparse_mm(a, - b_comp, - e, - scale_a, - scale_b, - out_dtype=torch.bfloat16) + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) baseline = baseline_scaled_mm(a, b, scale_a, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index c89c7d492f75d..6087247de5a94 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -560,8 +560,8 @@ def cutlass_compress_entry(a: torch.Tensor) \ def cutlass_scaled_sparse_mm( - a: torch.Tensor, # row-major activations - b: torch.Tensor, # row-major weight matrix + a: torch.Tensor, # row-major activations + b: torch.Tensor, # row-major weight matrix e: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, @@ -578,7 +578,8 @@ def cutlass_scaled_sparse_mm( n = a_t.shape[1] out = torch.empty((n, m), dtype=out_dtype, device=a.device).t() - torch.ops._C.cutlass_scaled_sparse_mm(out, b, e, a_t, scale_b, scale_a, bias) + torch.ops._C.cutlass_scaled_sparse_mm(out, b, e, a_t, scale_b, scale_a, + bias) return out.t()