diff --git a/CMakeLists.txt b/CMakeLists.txt
index 943424bc4edfa..0ebe0bcf4e8aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -202,18 +202,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
   # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
 
   FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        GIT_TAG v3.5.1
+        GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
         GIT_PROGRESS TRUE
 
         # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
         # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
         # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
-        GIT_SHALLOW TRUE
+        # GIT_SHALLOW FALSE
   )
   FetchContent_MakeAvailable(cutlass)
 
@@ -225,7 +225,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       "csrc/quantization/gguf/gguf_kernel.cu"
       "csrc/custom_all_reduce.cu"
       "csrc/permute_cols.cu"
-      "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
+      "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+      "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
+      "csrc/sparse/cutlass/sparse_compressor.cu")
 
   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -255,11 +257,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   #
-  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
+  # The cutlass_scaled_mm, cutlass_scaled_sparse_mm, and cutlass_compressor kernels
+  # for Hopper (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
   cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+      "csrc/sparse/cutlass/sparse_compressor.cu"
+      "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -268,12 +273,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
   else()
     if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+      message(STATUS "Not building cutlass_c3x kernels as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
-                     "later if you intend on running FP8 quantized models on "
+                     "later if you intend on running FP8 quantized or sparse models on "
                      "Hopper.")
     else()
-      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+      message(STATUS "Not building cutlass_c3x kernels as no compatible archs found "
                      "in CUDA target architectures")
     endif()
 
@@ -398,6 +403,9 @@ define_gpu_extension_target(
 # Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) +# include(nm_cutlass_c.cmake) +# build_nm_cutlass_c() + # # _moe_C extension # diff --git a/benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py b/benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py new file mode 100644 index 0000000000000..22616c2359b74 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py @@ -0,0 +1,311 @@ +## Cutlass benchmark V1 + +from typing import Callable, Iterable + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import make_rand_sparse_tensors + +import vllm._custom_ops as ops + + +# bench +def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, + **kwargs) -> TMeasurement: + min_run_time = 1 + + globals = { + "args": args, + "kwargs": kwargs, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.int8 + a_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) + + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + + timers = [] + + # pytorch impl - bfloat16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, a.to(dtype=torch.bfloat16), + b.to(dtype=torch.bfloat16))) + + # pytorch impl - float16 + timers.append( + bench_fn(label, sub_label, + "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, + a.to(dtype=torch.float16), b.to(dtype=torch.float16))) + + # cutlass impl: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, + torch.bfloat16)) + + # cutlass with bias: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.bfloat16, + bias)) + + # cutlass impl: fp16 output + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_fp16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, + torch.float16)) + + # cutlass with bias: fp16 output + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_fp16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16, + bias.to(dtype=torch.float16))) + + return timers + + +def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.float8_e4m3fn + + # Create tensors + b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) + aT = a.t() + bT = b.t() + bf16_a = a.to(dtype=torch.bfloat16) + bf16_bT = bT.to(dtype=torch.bfloat16) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + + timers = [] + + # pytorch impl w. 
bf16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), + bT.to(dtype=torch.bfloat16, device="cuda"))) + + # pytorch impl: bf16 output, without fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm", + torch._scaled_mm, + a, + bT, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16)) + + # pytorch impl: bf16 output, with fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + bT, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True)) + + # pytorch impl: fp16 output, without fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm", + torch._scaled_mm, + a, + bT, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16)) + + # pytorch impl: fp16 output, with fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + bT, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + use_fast_accum=True)) + + # cutlass impl: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, + torch.bfloat16)) + # cutlass impl: fp16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16)) + + return timers + + +def bench_fp16(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.float16 + a_compressed, e, a, b = make_rand_sparse_tensors(torch.float16, m, n, k) + + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + + timers = [] + + # # pytorch impl w. 
bf16 + # timers.append( + # bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + # torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), + # b.to(dtype=torch.bfloat16, device="cuda"))) + + # # pytorch impl: bf16 output + # timers.append( + # bench_fn(label, + # sub_label, + # "pytorch_fp16_fp16_bf16_scaled_mm", + # torch._scaled_mm, + # a, + # b, + # scale_a=scale_a, + # scale_b=scale_b, + # out_dtype=torch.bfloat16)) + + # # pytorch impl: fp16 output + # timers.append( + # bench_fn(label, + # sub_label, + # "pytorch_fp16_fp16_fp16_scaled_mm", + # torch._scaled_mm, + # a, + # b, + # scale_a=scale_a, + # scale_b=scale_b, + # out_dtype=torch.float16)) + + # cutlass impl: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp16_fp16_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, + torch.bfloat16)) + + # cutlass impl: fp16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp16_fp16_fp16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16)) + + # cutlass impl: bf16 output, with bias + timers.append( + bench_fn(label, sub_label, "cutlass_fp16_fp16_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.bfloat16, + bias)) + + # cutlass impl: fp16 output, with bias + timers.append( + bench_fn(label, sub_label, "cutlass_fp16_fp16_fp16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16, + bias.to(dtype=torch.float16))) + + return timers + + +def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.bfloat16 + a_compressed, e, a, b = make_rand_sparse_tensors(torch.bfloat16, m, n, k) + + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + + timers = [] + + # # pytorch impl w. 
bf16 + # timers.append( + # bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + # torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), + # b.to(dtype=torch.bfloat16, device="cuda"))) + + # # pytorch impl: bf16 output + # timers.append( + # bench_fn(label, + # sub_label, + # "pytorch_fp16_fp16_bf16_scaled_mm", + # torch._scaled_mm, + # a, + # b, + # scale_a=scale_a, + # scale_b=scale_b, + # out_dtype=torch.bfloat16)) + + # # pytorch impl: fp16 output + # timers.append( + # bench_fn(label, + # sub_label, + # "pytorch_fp16_fp16_fp16_scaled_mm", + # torch._scaled_mm, + # a, + # b, + # scale_a=scale_a, + # scale_b=scale_b, + # out_dtype=torch.float16)) + + # cutlass impl: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, + torch.bfloat16)) + + # cutlass impl: fp16 output + timers.append( + bench_fn(label, sub_label, "cutlass_bf16_bf16_fp16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16)) + + # cutlass impl: bf16 output, with bias + timers.append( + bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.bfloat16, + bias)) + + # cutlass impl: fp16 output, with bias + timers.append( + bench_fn(label, sub_label, "cutlass_bf16_bf16_fp16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16, + bias.to(dtype=torch.float16))) + + return timers + + +def bench_v1(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + # if dtype == torch.int8: + # return bench_int8(dtype, m, k, n, label, sub_label) + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, m, k, n, label, sub_label) + # if dtype == torch.float16: + # return bench_fp16(dtype, m, k, n, label, sub_label) + # if dtype == torch.bfloat16: + # return bench_bf16(dtype, m, k, n, label, sub_label) + raise ValueError("unsupported type") diff --git a/benchmarks/cutlass_benchmarks/sparse_mm/bench_v2.py b/benchmarks/cutlass_benchmarks/sparse_mm/bench_v2.py new file mode 100644 index 0000000000000..f9b4871044526 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/sparse_mm/bench_v2.py @@ -0,0 +1,556 @@ +import dataclasses +import random +from typing import Any, Callable, Iterable, Optional, Tuple, Dict, List + +import multiprocessing as mp +from multiprocessing import Process, Queue +from queue import Empty + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import make_n_rand_sparse_tensors + +import vllm._custom_ops as ops +import traceback + +import json +import os +import hashlib +from datetime import datetime +from pathlib import Path + + +@dataclasses.dataclass +class CudaGraphBenchParams: + num_ops_in_cuda_graph: int + + +@dataclasses.dataclass +class ArgPool: + ''' + When some argument of the benchmarking function is annotated with this type, + the benchmarking class (BenchMM) will collapse the argument to a pick a + single value from the given list of values, during function invocation. + + For every invocation during a benchmarking run, it will choose a + different value from the list. 
+ ''' + values: Iterable[Any] + + +class BenchMM: + + class ArgsIterator: + + def __init__(self, args_list, kwargs_list): + assert len(args_list) == len(kwargs_list) + self.args_list = args_list + self.kwargs_list = kwargs_list + self.n = len(self.args_list) + self.idx = 0 + + def __next__(self): + while True: + yield (self.args_list[self.idx], self.kwargs_list[self.idx]) + self.idx += 1 + self.idx = self.idx % self.n + + def reset(self): + self.idx = 0 + + @property + def n_args(self): + return self.n + + def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams], + label: str, sub_label: str, description: str, fn: Callable, + *args, **kwargs): + + self.cuda_graph_params = cuda_graph_params + self.use_cuda_graph = self.cuda_graph_params is not None + self.label = label + self.sub_label = sub_label + self.description = description + self.fn = fn + + # Process args + self._args = args + self._kwargs = kwargs + self.args_list, self.kwargs_list = self.collapse_argpool( + *args, **kwargs) + self.args_iterator = self.ArgsIterator(self.args_list, + self.kwargs_list) + + # Cudagraph runner + self.g = None + if self.use_cuda_graph: + self.g = self.get_cuda_graph_runner() + + # benchmark run params + self.min_run_time = 1 + + def collapse_argpool(self, *args, **kwargs): + kwargs = kwargs if kwargs is not None else {} + assert kwargs is None or all([ + not isinstance(v, ArgPool) for k, v in kwargs.items() + ]), 'ArgPools in kwargs are not supported yet' + + arg_pool_indices = [ + i for i, x in enumerate(args) if isinstance(x, ArgPool) + ] + if len(arg_pool_indices) == 0: + return [args], [kwargs] + + # make sure all the Arg pools have the same number of choices + arg_pool_size = len(args[arg_pool_indices[0]].values) + assert all( + [len(args[i].values) == arg_pool_size for i in arg_pool_indices]) + + # create copies of the args + args_list = [] + kwargs_list = [] + for _ in range(arg_pool_size): + args_list.append(args) + kwargs_list.append(kwargs.copy()) + + # collapse the arg pools by simply choosing the ith value + for i in range(arg_pool_size): + assert isinstance(args_list[i], tuple) + # get as list + args_i = list(args_list[i]) + # collapse - make replacements + for arg_pool_idx in arg_pool_indices: + val_from_pool = args_i[arg_pool_idx].values[i] + args_i[arg_pool_idx] = val_from_pool + # store back as tuple + args_list[i] = tuple(args_i) + + return args_list, kwargs_list + + def get_cuda_graph_runner(self): + assert self.use_cuda_graph + assert self.args_iterator is not None + + num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph + + # warmup + args_it = self.args_iterator.__next__() + for _ in range(5): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + for _ in range(num_graph_ops): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + return g + + def run_cudagraph(self) -> TMeasurement: + assert self.use_cuda_graph + globals = {'g': self.g} + + return TBenchmark.Timer( + stmt="g.replay()", + globals=globals, + label=self.label, + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run_eager(self) -> TMeasurement: + setup = None + stmt = None + globals = None + + has_arg_pool = self.args_iterator.n_args > 1 + if has_arg_pool: + setup = ''' + args_iterator.reset() + args_it = 
args_iterator.__next__() + ''' + stmt = ''' + args, kwargs = next(args_it) + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args_iterator': self.args_iterator} + else: + # no arg pool. Just use the args and kwargs directly + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + args, kwargs = next(args_it) + + setup = "" + stmt = ''' + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs} + + return TBenchmark.Timer( + stmt=stmt, + setup=setup, + globals=globals, + label=self.label, + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run(self) -> TMeasurement: + timer = None + if self.use_cuda_graph: # noqa SIM108 + timer = self.run_cudagraph() + else: + timer = self.run_eager() + #assert timer.meets_confidence() + #assert not timer.has_warnings, f"Warnings {timer._warnings}" + if not timer.meets_confidence() or timer.has_warnings: + print("Doesn't meet confidence - re-running bench ...") + return self.run() + return timer + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_type: + print(f"exc type {exc_type}") + print(f"exc value {exc_value}") + print(f"exc traceback {traceback}") + + +def run_single_benchmark_process(kernel_config: Dict, gpu_id: int, queue: Queue): + """ + Run a single kernel benchmark in an isolated process. + Puts (success, result, config) tuple in the queue. + """ + try: + torch.cuda.set_device(gpu_id) + + # Initialize CUDA tensors + m, k, n = kernel_config['m'], kernel_config['k'], kernel_config['n'] + dtype = kernel_config['dtype'] + + # Create tensors + BComps, Es, As, Bs = make_n_rand_sparse_tensors( + kernel_config.get('arg_pool_size', 1), + dtype, m, n, k + ) + AsT = [x.t() for x in As] + BsT = [x.t() for x in Bs] + bf16_As = [x.to(dtype=torch.bfloat16) for x in As] + bf16_BsT = [x.to(dtype=torch.bfloat16) for x in BsT] + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + # Because the transposed output will be computed + out = torch.zeros((n, m), dtype=torch.bfloat16, device="cuda") + + # Setup benchmark params + cuda_graph_params = None + if cgops := kernel_config.get('cuda_graph_ops'): + cuda_graph_params = CudaGraphBenchParams(cgops) + + label = kernel_config['label'] + sub_label = kernel_config['sub_label'] + + # Initialize benchmark based on kernel type + bench = None + kernel_type = kernel_config['kernel_type'] + + if kernel_type == 'pytorch_mm': + bench = BenchMM(cuda_graph_params, label, sub_label, + "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, + ArgPool(bf16_As), ArgPool(bf16_BsT)) + + elif kernel_type == 'pytorch_scaled_mm': + bench = BenchMM(cuda_graph_params, label, sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm", + torch._scaled_mm, + ArgPool(As), ArgPool(BsT), + scale_a=scale_a, scale_b=scale_b, + out_dtype=torch.bfloat16) + + elif kernel_type == 'pytorch_scaled_mm_fast': + bench = BenchMM(cuda_graph_params, label, sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", + torch._scaled_mm, + ArgPool(As), ArgPool(BsT), + scale_a=scale_a, scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True) + + elif kernel_type == 'cutlass_scaled_mm': + bench = BenchMM(cuda_graph_params, label, sub_label, + "cutlass_fp8_fp8_bf16_scaled_mm_default", + ops.cutlass_scaled_mm, + ArgPool(As), ArgPool(BsT), scale_a, scale_b, + torch.bfloat16) + + elif kernel_type == 'cutlass_sparse_mm': + 
bench = BenchMM(cuda_graph_params, label, sub_label, + "cutlass_fp8_fp8_bf16_scaled_sparse_mm_default", + ops.cutlass_scaled_sparse_mm, + ArgPool(BComps), ArgPool(Es), ArgPool(AsT), + scale_b, scale_a, torch.bfloat16) + + + # Run the benchmark + result = bench.run() + queue.put((True, result, kernel_config)) + + except Exception as e: + print(f"Error in benchmark process: {str(e)}") + print(traceback.format_exc()) + queue.put((False, None, kernel_config)) + finally: + # Explicit cleanup + torch.cuda.empty_cache() + +def benchmark_gpu_worker(gpu_id: int, task_queue: Queue, result_queue: Queue): + """Worker process that spawns individual benchmark processes for each kernel.""" + try: + while True: + try: + kernel_config = task_queue.get_nowait() + if kernel_config is None: # Poison pill + break + + # Create a new process queue for this specific benchmark + process_queue = Queue() + + # Create and start a new process for this kernel benchmark + p = Process(target=run_single_benchmark_process, + args=(kernel_config, gpu_id, process_queue)) + p.start() + + # Wait for result with timeout (5 minutes for benchmarking) + try: + success, result, config = process_queue.get(timeout=300) + result_queue.put((success, result, config)) + except Empty: + print(f"Kernel {kernel_config.get('kernel_type')} benchmark timed out") + result_queue.put((False, None, kernel_config)) + + # Cleanup + p.join(timeout=1) # Give it 1 second to join + if p.is_alive(): + p.terminate() + p.join() + + except Empty: + break + except Exception as e: + print(f"Error in GPU {gpu_id} worker: {str(e)}") + print(traceback.format_exc()) + if 'kernel_config' in locals(): + result_queue.put((False, None, kernel_config)) + + finally: + print(f"GPU {gpu_id} worker finished") + +def run_kernels_on_gpus(configs: List[Dict]) -> List[Tuple[bool, Optional[TMeasurement], Dict]]: + MULTI_GPU_MULTI_PROCESS = False # Set to False for single GPU testing + if MULTI_GPU_MULTI_PROCESS: + gpus_list = [0] + task_queue = Queue() + result_queue = Queue() + + configs = configs[:10] + + # Fill task queue + for config in configs: + task_queue.put(config) + for _ in gpus_list: # Add poison pills + task_queue.put(None) + + # Start GPU workers + workers = [] + for gpu_id in gpus_list: + p = Process(target=benchmark_gpu_worker, args=(gpu_id, task_queue, result_queue)) + p.start() + workers.append(p) + + # Collect results + results = [] + completed = 0 + total_tasks = len(configs) + + while completed < total_tasks: + success, result, config = result_queue.get() + results.append((success, result, config)) + completed += 1 + + # Print progress + status = "Success" if success else "Failed" + print(f"{status}: {config['kernel_type']}") + + # Cleanup workers + for worker in workers: + worker.join(timeout=1) + if worker.is_alive(): + worker.terminate() + worker.join() + + return results + else: + """Run kernel benchmarks in a single process.""" + results = [] + gpu_id = 0 # Using the same GPU as before + torch.cuda.set_device(gpu_id) + # configs = configs[:10] # Keep the original slice + + for config in configs: + try: + # Initialize CUDA tensors + m, k, n = config['m'], config['k'], config['n'] + dtype = config['dtype'] + + # Create tensors + BComps, Es, As, Bs = make_n_rand_sparse_tensors( + config.get('arg_pool_size', 1), + dtype, m, n, k + ) + AsT = [x.t() for x in As] + BsT = [x.t() for x in Bs] + bf16_As = [x.to(dtype=torch.bfloat16) for x in As] + bf16_BsT = [x.to(dtype=torch.bfloat16) for x in BsT] + scale_a = torch.tensor(1.0, device="cuda", 
dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + out = torch.zeros((n, m), dtype=torch.bfloat16, device="cuda") + + # Setup benchmark params + cuda_graph_params = None + if cgops := config.get('cuda_graph_ops'): + cuda_graph_params = CudaGraphBenchParams(cgops) + + label = config['label'] + sub_label = config['sub_label'] + + # Initialize benchmark based on kernel type + bench = None + kernel_type = config['kernel_type'] + + if kernel_type == 'pytorch_mm': + bench = BenchMM(cuda_graph_params, label, sub_label, + "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, + ArgPool(bf16_As), ArgPool(bf16_BsT)) + + elif kernel_type == 'pytorch_scaled_mm': + bench = BenchMM(cuda_graph_params, label, sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm", + torch._scaled_mm, + ArgPool(As), ArgPool(BsT), + scale_a=scale_a, scale_b=scale_b, + out_dtype=torch.bfloat16) + + elif kernel_type == 'pytorch_scaled_mm_fast': + bench = BenchMM(cuda_graph_params, label, sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", + torch._scaled_mm, + ArgPool(As), ArgPool(BsT), + scale_a=scale_a, scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True) + + elif kernel_type == 'cutlass_scaled_mm': + bench = BenchMM(cuda_graph_params, label, sub_label, + "cutlass_fp8_fp8_bf16_scaled_mm_default", + ops.cutlass_scaled_mm, + ArgPool(As), ArgPool(BsT), scale_a, scale_b, + torch.bfloat16) + + elif kernel_type == 'cutlass_sparse_mm': + bench = BenchMM(cuda_graph_params, label, sub_label, + "cutlass_fp8_fp8_bf16_scaled_sparse_mm_default", + ops.cutlass_scaled_sparse_mm, + ArgPool(BComps), ArgPool(Es), ArgPool(AsT), + scale_b, scale_a, torch.bfloat16) + + # Run the benchmark + result = bench.run() + + # Print progress + print(f"Success: {kernel_type}") + + results.append((True, result, config)) + + # Cleanup + torch.cuda.empty_cache() + + except Exception as e: + print(f"Error in benchmark: {str(e)}") + print(traceback.format_exc()) + results.append((False, None, config)) + torch.cuda.empty_cache() + + return results + + +def get_cache_path() -> str: + """Get the path to the cache file for the given configuration hash.""" + return f'{Path(os.path.dirname(os.path.realpath(__file__)))}/stable_kernels.json' + + +def bench_fp8(dtype: torch.dtype, with_cuda_graph: Optional[int], + with_arg_pool: Optional[int], m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + + # Check if context is not set + try: + mp.set_start_method('spawn', force=True) + except RuntimeError: + pass + + timers = [] + gpus_list = [5] # Using the same GPU list as original code + + # Base configuration for all kernels + base_config = { + 'm': m, + 'k': k, + 'n': n, + 'dtype': dtype, + 'cuda_graph_ops': with_cuda_graph, + 'arg_pool_size': with_arg_pool if with_arg_pool else 1, + 'label': label, + 'sub_label': sub_label + } + + # Prepare configs for all kernels + standard_kernels = [ + {'kernel_type': 'pytorch_mm'}, + {'kernel_type': 'pytorch_scaled_mm'}, + {'kernel_type': 'pytorch_scaled_mm_fast'}, + {'kernel_type': 'cutlass_scaled_mm'}, + {'kernel_type': 'cutlass_sparse_mm'} + ] + + # Create configs for standard kernels + all_configs = [{**base_config, **kernel} for kernel in standard_kernels] + + # Run all kernels distributed across GPUs + print(f"Running {len(all_configs)} benchmarks across {len(gpus_list)} GPUs...") + results = run_kernels_on_gpus(all_configs) + + # Process results + for success, result, _ in results: + if success and result is not None: + timers.append(result) + + 
return timers + + +def bench_v2(dtype: torch.dtype, with_cuda_graph: Optional[int], + with_arg_pool: Optional[int], m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, with_cuda_graph, with_arg_pool, m, k, n, label, + sub_label) + raise ValueError("unsupported type") diff --git a/benchmarks/cutlass_benchmarks/sparse_mm/mm_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_mm/mm_benchmarks.py new file mode 100644 index 0000000000000..82567a57b303a --- /dev/null +++ b/benchmarks/cutlass_benchmarks/sparse_mm/mm_benchmarks.py @@ -0,0 +1,215 @@ +import argparse +import copy +import itertools +import pickle as pkl +import time +from typing import Iterable, List, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from bench_v1 import bench_v1 +from bench_v2 import bench_v2 +from torch.utils.benchmark import Measurement as TMeasurement +from weight_shapes import WEIGHT_SHAPES + +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + results = [] + dtype = args.dtype + + use_bench_v2 = args.with_cuda_graph or args.with_arg_pool + for m, k, n in MKNs: + if use_bench_v2: + label = f"scaled-sparse-{dtype}-gemm" + label = f"{label}-cugraph_{args.with_cuda_graph}" \ + if args.with_cuda_graph else label + label = f"{label}-argpool_{args.with_arg_pool}" \ + if args.with_arg_pool else label + timers = bench_v2(args.dtype, args.with_cuda_graph, + args.with_arg_pool, m, k, n, label, + f"MKN=({m}x{k}x{n})") + else: + timers = bench_v1(args.dtype, m, k, n, f"scaled-sparse-{dtype}-gemm", + f"MKN=({m}x{k}x{n})") + + print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output(data: Iterable[TMeasurement], + MKNs: Iterable[Tuple[int, int, int]], + base_description: str, + timestamp=None): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list( + range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) + n = len(dim_sizes) + Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes + Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes + Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes + MKNs = list(zip(Ms, Ks, Ns)) + data = run(args, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + if tp_split_dim is not None: + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + 
return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args, MKNs) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + if dt == "fp16": + return torch.float16 + if dt == "bf16": + return torch.bfloat16 + raise ValueError("unsupported dtype") + + parser = FlexibleArgumentParser( + description=""" +Benchmark Cutlass GEMM. + + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/sparse_mm/mm_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/sparse_mm/mm_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/sparse_mm/mm_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8', 'fp16', 'bf16']") + parser.add_argument( + '--with-cuda-graph', + type=int, + default=None, + help="Number of ops/matmuls in a cudagraph execution. When set" + "cuda-graphs is enabled") + parser.add_argument( + '--with-arg-pool', + type=int, + default=None, + help="Number of A and B tensors to use as arg-pool. 
When not set," + "it defaults to 1") + + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/cutlass_benchmarks/sparse_mm/stable_kernels.json b/benchmarks/cutlass_benchmarks/sparse_mm/stable_kernels.json new file mode 100644 index 0000000000000..2cb53b60807a1 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/sparse_mm/stable_kernels.json @@ -0,0 +1 @@ +{"date": "2024-11-09T05:36:00.932166", "stable_kernels": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 
386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 537, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 
1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264, 1265, 1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294, 1295]} \ No newline at end of file diff --git a/benchmarks/cutlass_benchmarks/sparse_mm/utils.py b/benchmarks/cutlass_benchmarks/sparse_mm/utils.py new file mode 100644 index 0000000000000..2d753b254a0ab --- /dev/null +++ b/benchmarks/cutlass_benchmarks/sparse_mm/utils.py @@ -0,0 +1,87 @@ +# Cutlass bench utils +from typing import Iterable, Tuple + +import torch + +import vllm._custom_ops as ops + + +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.Tensor) -> torch.Tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def to_bf16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.bfloat16) + + +def to_fp16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.float16) + + +def prune_to_2_4(tensor): + # Reshape tensor to [N, 4] where N is number of groups of 4 + original_shape = tensor.shape + reshaped = tensor.reshape(-1, 4) + + # Get indices of top 2 absolute values in each group of 4 + _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) + + # Create binary mask + mask = torch.zeros_like(reshaped) + mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype)) + + # Apply mask and reshape back + pruned = reshaped * mask + + # Turn all -0.0 to 0.0 + pruned[pruned == -0.0] = 0.0 + + return pruned.reshape(original_shape) + + + +def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + # # Initialize a to all ones + # a = torch.ones((m, k), device='cuda') + # # Initialize b to all ones + # b = torch.ones((n, k), device='cuda') + + b = prune_to_2_4(b) + + if dtype == torch.int8: + a, b = to_int8(a), to_int8(b) + elif dtype == torch.float8_e4m3fn: + a, b = to_fp8(a), to_fp8(b) + elif dtype == torch.float16: + a, b = to_fp16(a), to_fp16(b) + elif dtype == torch.bfloat16: + a, b = to_bf16(a), to_bf16(b) + else: + raise ValueError("unsupported dtype") + + b_compressed, e = ops.cutlass_compress_entry(b) + + # Compressed B, Metadata, Original A, B + return b_compressed, e, 
a, b + + +def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype, + m: int, n: int, k: int) -> \ + Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: + ABs = [] + for _ in range(num_tensors): + b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) + if b_comp is not None: + ABs.append(make_rand_sparse_tensors(dtype, m, n, k)) + BComps, Es, As, Bs = zip(*ABs) + return list(BComps), list(Es), list(As), list(Bs) diff --git a/benchmarks/cutlass_benchmarks/sparse_mm/weight_shapes.py b/benchmarks/cutlass_benchmarks/sparse_mm/weight_shapes.py new file mode 100644 index 0000000000000..2999244bf9b95 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/sparse_mm/weight_shapes.py @@ -0,0 +1,75 @@ +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 0), + ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], + "meta-llama/Llama-2-70b-tp4-hf": [([8192, 2560], None), ([2048, + 8192], None), + ([8192, 14336], None), + ([7168, 8192], None)], + # The shape space is very big when benchmarking a large set of kernels. + # For example: Let, + # - #kernels to benchmark be 1700 + # - #models to benchmark be 4 (each model has 4 shapes) + # - #batch sizes be 6 (16, 32, 64, 128, 256, 512) + # For 1 kernel, 1 shape and 1 batch-size, H100 takes 1 second (approx.) + # to run, then the benchmark suite would take, + # 1700 * (4 * 4) * 6 = 163200 seconds => 46 hrs. + # Below, we exploit some observation on the benchmark shapes to create a + # representative set. + # + # From previous benchmarking runs, we observe that perf if stratified as, + # N - small, medium, large and K - small and large. We also observe that + # in the model shapes, when K is small, we have small, medium and large Ns. + # when K is large, we only have small Ns. 
+ # + # models : ['meta-llama/Llama-2-7b-hf', 'meta-llama/Llama-3-8b', + # 'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-70b-tp4-hf'] + # Ks : [2048, 4096, 5120, 7168, 8192, 11008, 13824, 14336] + # Ns : [2560, 4096, 5120, 6144, 8192, 12288, 14336, 15360, + # 22016, 27648, 28672] + "llama-representative-set": [ + # ([4096, 4096], None), # small K, small N + ([4096, 8192], None), # small K, medium N + ([4096, 22016], None), # small K, large N + ([14336, 4096], None), # large K, small N + ([8192, 14336], None), # medium K, large N (from llama-2-70b-tp4-hf + ], +} diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 63cf5d50cac75..abcde3b016a7b 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -386,4 +386,4 @@ def to_torch_dtype(dt): model_parser.set_defaults(func=run_model_bench) args = parser.parse_args() - args.func(args) + args.func(args) \ No newline at end of file diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index 25ec9d6028627..d58fb0bf86374 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -40,4 +40,4 @@ ([8192, 57344], 1), ([28672, 8192], 0), ], -} +} \ No newline at end of file diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index 25ec9d6028627..77f15891d84b2 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -40,4 +40,36 @@ ([8192, 57344], 1), ([28672, 8192], 0), ], + "meta-llama/Llama-2-70b-tp4-hf": [([8192, 2560], None), ([2048, + 8192], None), + ([8192, 14336], None), + ([7168, 8192], None)], + # The shape space is very big when benchmarking a large set of kernels. + # For example: Let, + # - #kernels to benchmark be 1700 + # - #models to benchmark be 4 (each model has 4 shapes) + # - #batch sizes be 6 (16, 32, 64, 128, 256, 512) + # For 1 kernel, 1 shape and 1 batch-size, H100 takes 1 second (approx.) + # to run, then the benchmark suite would take, + # 1700 * (4 * 4) * 6 = 163200 seconds => 46 hrs. + # Below, we exploit some observation on the benchmark shapes to create a + # representative set. + # + # From previous benchmarking runs, we observe that perf if stratified as, + # N - small, medium, large and K - small and large. We also observe that + # in the model shapes, when K is small, we have small, medium and large Ns. + # when K is large, we only have small Ns. 
+ # + # models : ['meta-llama/Llama-2-7b-hf', 'meta-llama/Llama-3-8b', + # 'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-70b-tp4-hf'] + # Ks : [2048, 4096, 5120, 7168, 8192, 11008, 13824, 14336] + # Ns : [2560, 4096, 5120, 6144, 8192, 12288, 14336, 15360, + # 22016, 27648, 28672] + "llama-representative-set": [ + ([4096, 4096], None), # small K, small N + ([4096, 8192], None), # small K, medium N + ([4096, 22016], None), # small K, large N + ([14336, 4096], None), # large K, small N + ([8192, 14336], None), # medium K, large N (from llama-2-70b-tp4-hf + ], } diff --git a/csrc/ops.h b/csrc/ops.h index c50eb39a3dacc..3c9b814a456e8 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -142,6 +142,17 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& azp_adj, c10::optional const& azp, c10::optional const& bias); + +bool cutlass_scaled_sparse_mm_supports_fp8(int64_t cuda_device_capability); + +void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& e, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + c10::optional const& bias); + +bool cutlass_compress_entry(torch::Tensor& a_compressed, torch::Tensor& e, + torch::Tensor const& a); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 292c9e4b34e1c..84e1f367c8722 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -1,589 +1,7 @@ -// clang-format will break include orders -// clang-format off -#include - -#if defined CUDA_VERSION && CUDA_VERSION >= 12000 - +#include #include - -#include - -#include -#include -#include - #include "cutlass/cutlass.h" - -#include "cute/tensor.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cutlass/numeric_types.h" - -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/kernel/gemm_universal.hpp" -#include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass/gemm/collective/collective_builder.hpp" - -#include "broadcast_load_epilogue_c3x.hpp" -#include "common.hpp" -// clang-format on - -using namespace cute; - -/* - This file defines quantized GEMM operations using the CUTLASS 3.x API, for - NVIDIA GPUs with sm90a (Hopper) or later. - - Epilogue functions can be defined to post-process the output before it is - written to GPU memory. - Epilogues must contain a public type named EVTCompute of type Sm90EVT, - as well as a static prepare_args function that constructs an - EVTCompute::Arguments struct. -*/ - -namespace { - -// A wrapper for the GEMM kernel that is used to guard against compilation on -// architectures that will never use the kernel. The purpose of this is to -// reduce the size of the compiled binary. -// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef -// into code that will be executed on the device where it is defined. -template -struct enable_sm90_or_later : Kernel { - template - CUTLASS_DEVICE void operator()(Args&&... args) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 - Kernel::operator()(std::forward(args)...); - #endif - } -}; - -/* - * This class provides the common load descriptors for the - * ScaledEpilogue[...] 
classes - */ -template -struct ScaledEpilogueBase { - protected: - using Accum = cutlass::epilogue::fusion::Sm90AccFetch; - - template - using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<0>, Int<0>>>; - - template - using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<1>, Int<0>>>; - - // Don't want to support nullptr by default - template - using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; - - // Don't want to support nullptr by default - template - using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; - - // This utility function constructs the arguments for the load descriptors - // from a tensor. It can handle both row and column, as well as row/column or - // scalar cases. - template - static auto args_from_tensor(torch::Tensor const& tensor) { - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = static_cast(tensor.data_ptr()); - if constexpr (std::is_same_v> || - std::is_same_v>) { - return Arguments{data_ptr, tensor.numel() != 1}; - } else { - static_assert(!std::is_same_v> && - !std::is_same_v>); - return Arguments{data_ptr}; - } - } - - // This overload handles the case where there might not be a tensor, in which - // case a nullptr is passed and a constant (0) is used. - template - static auto args_from_tensor(c10::optional const& tensor) { - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; - static_assert(std::is_same_v> || - std::is_same_v>); - return Arguments{data_ptr}; - } -}; - -/* - This epilogue function defines a quantized GEMM operation similar to - torch.scaled_mm_. - - A and B may be both either int8 or fp8_e4m3. A can be - quantized per-tensor or per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. 
-*/ -template -struct ScaledEpilogue - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - - using Compute0 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::fusion::Sm90EVT; - - using Compute1 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args}; - } -}; - -/* - * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. - * This bias can also be used in the per-tensor azp case, where the activation - * zero point (azp) is used to compute an azp correction term, - * which is folded into the bias. - * - * The bias tensor must be per-output channel. - * ScaleA and ScaleB can be per-tensor or per-token/per-channel. - */ -template -struct ScaledEpilogueBias - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - - using Compute0 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::fusion::Sm90EVT; - - using Compute1 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - - using ArgumentType = typename EVTCompute::Arguments; - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args, bias_args}; - } -}; - -/* - * This epilogue directly supports per-tensor azp in int32 form. - * As opposed to the per-token epilogue below, this epilogue only has an azp_adj - * term, which should already be multiplied with the scalar azp. - * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. - * - * This epilogue also supports bias, which remains per-channel. 
- */ -template -struct ScaledEpilogueBiasAzp - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - - // This is the full AZP term, azp * J @ B, shape (1,n) - using AzpWithAdj = typename SUPER::template RowLoad; - - // Compute float(accum - azp_adj), both operands are int32_t - using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - -/* - * This epilogue supports per-token azp by computing and applying - * the correction term using a rank-1 update. If the term were materialized, - * it would require O(m*n) space, and this way it only requires O(m+n) space. - * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero - * point for each row of A. - * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. - * - * This epilogue also supports bias, which remains per-channel. 
- */ -template -struct ScaledEpilogueBiasAzpToken - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - - // Per-token azp term, shape (m,1) - using Azp = typename SUPER::template ColLoad; - - // This is the AZP adjustment term, J @ B, shape (1,n) - using AzpAdj = typename SUPER::template RowLoad; - - // Compute azp * azp_adj - using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, int32_t, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::fusion::Sm90EVT; - - // Compute float(accum - azp*azp_adj), all operands are int32_t - using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAcc = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - torch::Tensor const& azp, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_args = SUPER::template args_from_tensor(azp); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; - typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - -template typename Epilogue_, - typename TileShape, typename ClusterShape, typename KernelSchedule, - typename EpilogueSchedule> -struct cutlass_3x_gemm { - using ElementAB = ElementAB_; - using ElementD = ElementD_; - using ElementAcc = - typename std::conditional, int32_t, - float>::type; - - using EpilogueDescriptor = - cutlass::epilogue::collective::detail::EpilogueDescriptor< - TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD, - ElementD, EpilogueSchedule>; - - using Epilogue = Epilogue_; - - using StrideD = Stride, Int<0>>; - using ElementC = void; - using StrideC = StrideD; - - using EVTCompute = typename Epilogue::EVTCompute; - - using CollectiveEpilogue = - typename cutlass::epilogue::collective::CollectiveBuilder< - cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, - ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, - ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4, - EpilogueSchedule, EVTCompute>::CollectiveOp; - - static constexpr size_t CEStorageSize = - sizeof(typename CollectiveEpilogue::SharedStorage); - using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< - static_cast(CEStorageSize)>; - - // 
clang-format off - using CollectiveMainloop = - typename cutlass::gemm::collective::CollectiveBuilder< - cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, - ElementAB, cutlass::layout::RowMajor, 16, - ElementAB, cutlass::layout::ColumnMajor, 16, - ElementAcc, TileShape, ClusterShape, - Stages, - KernelSchedule>::CollectiveOp; - // clang-format on - - using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, - cutlass::gemm::PersistentScheduler>>; - - struct GemmKernel : public KernelType {}; -}; - -template -void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... epilogue_params) { - using ElementAB = typename Gemm::ElementAB; - using ElementD = typename Gemm::ElementD; - - int32_t m = a.size(0); - int32_t n = b.size(1); - int32_t k = a.size(1); - - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); - - using StrideA = Stride, int64_t>; - using StrideB = Stride, int64_t>; - using StrideC = typename Gemm::StrideC; - - StrideA a_stride{lda, Int<1>{}, 0}; - StrideB b_stride{ldb, Int<1>{}, 0}; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; - - using GemmKernel = typename Gemm::GemmKernel; - typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; - - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, - b_stride}; - - auto c_ptr = static_cast(out.data_ptr()); - typename GemmKernel::EpilogueArguments epilogue_args{ - Gemm::Epilogue::prepare_args( - std::forward(epilogue_params)...), - c_ptr, c_stride, c_ptr, c_stride}; - - typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, - prob_shape, mainloop_args, epilogue_args}; - - // Launch the CUTLASS GEMM kernel. 
- using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; - GemmOp gemm_op; - CUTLASS_CHECK(gemm_op.can_implement(args)); - - size_t workspace_size = gemm_op.get_workspace_size(args); - auto const workspace_options = - torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); - auto workspace = torch::empty(workspace_size, workspace_options); - - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); - - cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); - CUTLASS_CHECK(status); -} - -template typename Epilogue> -struct sm90_fp8_config_default { - // M in (128, inf) - static_assert(std::is_same()); - using KernelSchedule = - cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue> -struct sm90_fp8_config_M128 { - // M in (64, 128] - static_assert(std::is_same()); - using KernelSchedule = - cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_64, _128, _128>; - using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue> -struct sm90_fp8_config_M64 { - // M in [1, 64] - static_assert(std::is_same()); - using KernelSchedule = - cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_64, _64, _128>; - using ClusterShape = Shape<_1, _8, _1>; - - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue> -struct sm90_int8_config_default { - // For M > 128 and any N - static_assert(std::is_same()); - using KernelSchedule = - typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue> -struct sm90_int8_config_M128 { - // For M in (64, 128] and any N - static_assert(std::is_same()); - using KernelSchedule = - typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_64, _128, _128>; - using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue> -struct sm90_int8_config_M64 { - // For M in (32, 64] and any N - static_assert(std::is_same()); - using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_64, _64, _256>; - using ClusterShape = Shape<_1, _1, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue> -struct sm90_int8_config_M32_NBig { - // For M in [1, 32] and N >= 8192 - static_assert(std::is_same()); - using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_64, _128, _256>; - using ClusterShape = Shape<_1, _4, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue> -struct sm90_int8_config_M32_NSmall { - // For M in [1, 32] and N < 8192 - 
static_assert(std::is_same()); - using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_64, _64, _256>; - using ClusterShape = Shape<_1, _8, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -} // namespace +#include "scaled_mm_c3x.cuh" template typename Epilogue, @@ -748,4 +166,75 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a, } } -#endif +// hyper-parameter sweep kernels + +void cutlass_scaled_mm_sm90_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + c10::optional const& bias) { + assert(!bias); + + TORCH_CHECK(a_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _256>; + using ClusterShape = Shape<_1, _4, _1>; + using AccType = float; + + if (out.dtype() == torch::kBFloat16) { + using Cutlass3xGemm = + cutlass_3x_gemm; + + return cutlass_gemm_caller(out, a, b, a_scales, b_scales); + + } else { + TORCH_CHECK(out.dtype() == torch::kFloat16); + + using Cutlass3xGemm = + cutlass_3x_gemm; + + return cutlass_gemm_caller(out, a, b, a_scales, b_scales); + } +} + +void cutlass_simple_gemm_sm90_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b) { + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _256>; + using ClusterShape = Shape<_1, _4, _1>; + using AccType = float; + + if (out.dtype() == torch::kBFloat16) { + using Cutlass3xGemm = + cutlass_3x_simple_gemm; + + return cutlass_simple_gemm_caller(out, a, b); + + } else { + TORCH_CHECK(out.dtype() == torch::kFloat16); + + using Cutlass3xGemm = + cutlass_3x_simple_gemm; + + return cutlass_simple_gemm_caller(out, a, b); + } +} diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh new file mode 100644 index 0000000000000..9b1dd748bfbbe --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh @@ -0,0 +1,777 @@ +#pragma once + +// clang-format will break include orders +// clang-format off +#include + +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 + +#include + +#include + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "broadcast_load_epilogue_c3x.hpp" +#include "common.hpp" +// clang-format on + +using namespace cute; + +/* + This file defines quantized GEMM operations using the CUTLASS 3.x API, for + NVIDIA GPUs with sm90a (Hopper) or later. + + Epilogue functions can be defined to post-process the output before it is + written to GPU memory. + Epilogues must contain a public type named EVTCompute of type Sm90EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. 
+*/ + +// A wrapper for the GEMM kernel that is used to guard against compilation on +// architectures that will never use the kernel. The purpose of this is to +// reduce the size of the compiled binary. +// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef +// into code that will be executed on the device where it is defined. +template +struct enable_sm90_or_later : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { + #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 + Kernel::operator()(std::forward(args)...); + #endif + } +}; + +/* + * This class provides the common load descriptors for the + * ScaledEpilogue[...] classes + */ +template +struct ScaledEpilogueBase { + protected: + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + + template + using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<1>, Int<0>>>; + + // Don't want to support nullptr by default + template + using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, + Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + + // Don't want to support nullptr by default + template + using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, + Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + + // This utility function constructs the arguments for the load descriptors + // from a tensor. It can handle both row and column, as well as row/column or + // scalar cases. + template + static auto args_from_tensor(torch::Tensor const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = static_cast(tensor.data_ptr()); + if constexpr (std::is_same_v> || + std::is_same_v>) { + return Arguments{data_ptr, tensor.numel() != 1}; + } else { + static_assert(!std::is_same_v> && + !std::is_same_v>); + return Arguments{data_ptr}; + } + } + + // This overload handles the case where there might not be a tensor, in which + // case a nullptr is passed and a constant (0) is used. + template + static auto args_from_tensor(c10::optional const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; + static_assert(std::is_same_v> || + std::is_same_v>); + return Arguments{data_ptr}; + } +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch.scaled_mm_. + + A and B may be both either int8 or fp8_e4m3. A can be + quantized per-tensor or per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. 
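// ---------------------------------------------------------------------------
// Illustrative reference only (not part of this patch): the scaled GEMM that
// the ScaledEpilogue below fuses into the kernel, written out with plain
// libtorch ops. The function name and signature are assumptions made purely
// for illustration; only the formula D = (a_scales * A) (b_scales * B) with
// numpy-style broadcasting comes from the comment above.
#include <torch/torch.h>

torch::Tensor ref_scaled_mm(torch::Tensor const& a,         // [m, k] int8/fp8
                            torch::Tensor const& b,         // [k, n] int8/fp8
                            torch::Tensor const& a_scales,  // [m, 1] or scalar
                            torch::Tensor const& b_scales,  // [1, n] or scalar
                            torch::ScalarType out_dtype) {
  // Per-row scales broadcast over A, per-column scales broadcast over B.
  auto d = (a_scales * a.to(torch::kFloat32))
               .matmul(b_scales * b.to(torch::kFloat32));
  return d.to(out_dtype);
}
// ---------------------------------------------------------------------------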
+*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. + * This bias can also be used in the per-tensor azp case, where the activation + * zero point (azp) is used to compute an azp correction term, + * which is folded into the bias. + * + * The bias tensor must be per-output channel. + * ScaleA and ScaleB can be per-tensor or per-token/per-channel. + */ +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args, bias_args}; + } +}; + +/* + * This epilogue directly supports per-tensor azp in int32 form. + * As opposed to the per-token epilogue below, this epilogue only has an azp_adj + * term, which should already be multiplied with the scalar azp. + * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. + * + * This epilogue also supports bias, which remains per-channel. 
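// ---------------------------------------------------------------------------
// Illustrative sketch (assumption, not part of this patch): one way the
// azp_adj operand described above could be prepared on the host for the
// per-tensor case. Since J is the all-ones matrix, azp * J @ B reduces to a
// scaled column sum of B. The helper name is invented for illustration.
#include <torch/torch.h>

torch::Tensor make_azp_adj_per_tensor(int32_t azp, torch::Tensor const& b) {
  // azp * (J @ B): shape (1, n), int32, matching the AzpWithAdj row broadcast.
  return azp * b.to(torch::kInt32).sum(/*dim=*/0, /*keepdim=*/true);
}
// ---------------------------------------------------------------------------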
+ */ +template +struct ScaledEpilogueBiasAzp + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // This is the full AZP term, azp * J @ B, shape (1,n) + using AzpWithAdj = typename SUPER::template RowLoad; + + // Compute float(accum - azp_adj), both operands are int32_t + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +/* + * This epilogue supports per-token azp by computing and applying + * the correction term using a rank-1 update. If the term were materialized, + * it would require O(m*n) space, and this way it only requires O(m+n) space. + * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero + * point for each row of A. + * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. + * + * This epilogue also supports bias, which remains per-channel. 
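// ---------------------------------------------------------------------------
// Illustrative sketch (assumption, not part of this patch): the per-token
// correction that ScaledEpilogueBiasAzpToken fuses, written out with libtorch
// ops. The rank-1 outer product azp (m,1) x azp_adj (1,n) is exactly the
// O(m*n) term the epilogue avoids materializing. Names are invented for
// illustration; the formula follows the comment above.
#include <torch/torch.h>

torch::Tensor ref_azp_token_epilogue(
    torch::Tensor const& accum,     // [m, n] int32 accumulator
    torch::Tensor const& azp,       // [m, 1] int32, per-token zero point
    torch::Tensor const& azp_adj,   // [1, n] int32, J @ B
    torch::Tensor const& a_scales,  // [m, 1] or scalar, float32
    torch::Tensor const& b_scales,  // [1, n] or scalar, float32
    torch::Tensor const& bias) {    // [1, n]
  auto corrected = (accum - azp * azp_adj).to(torch::kFloat32);
  return a_scales * (b_scales * corrected) + bias;
}
// ---------------------------------------------------------------------------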
+ */ +template +struct ScaledEpilogueBiasAzpToken + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // Per-token azp term, shape (m,1) + using Azp = typename SUPER::template ColLoad; + + // This is the AZP adjustment term, J @ B, shape (1,n) + using AzpAdj = typename SUPER::template RowLoad; + + // Compute azp * azp_adj + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, int32_t, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + // Compute float(accum - azp*azp_adj), all operands are int32_t + using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAcc = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + torch::Tensor const& azp, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_args = SUPER::template args_from_tensor(azp); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +using GemmUniversalMode = cutlass::gemm::GemmUniversalMode; + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule, typename AccType, + typename TileSchedule = cutlass::gemm::PersistentScheduler, + GemmUniversalMode Mode_ = GemmUniversalMode::kGemm> +struct cutlass_3x_gemm { + static const GemmUniversalMode Mode = Mode_; + using ElementAB = ElementAB_; + using ElementD = ElementD_; + + using ElementAcc = + typename std::conditional, AccType, + AccType>::type; + + using EpilogueDescriptor = + cutlass::epilogue::collective::detail::EpilogueDescriptor< + TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD, + ElementD, EpilogueSchedule>; + + using Epilogue = Epilogue_; + + using StrideD = Stride, Int<0>>; + using ElementC = void; + using StrideC = StrideD; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4, + 
EpilogueSchedule, EVTCompute>::CollectiveOp; + + static constexpr size_t CEStorageSize = + sizeof(typename CollectiveEpilogue::SharedStorage); + using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(CEStorageSize)>; + + // clang-format off + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementAB, cutlass::layout::RowMajor, 16, + ElementAB, cutlass::layout::ColumnMajor, 16, + ElementAcc, TileShape, ClusterShape, + Stages, + KernelSchedule>::CollectiveOp; + // clang-format on + + using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + TileSchedule>>; + + struct GemmKernel : public KernelType {}; +}; + +template +inline void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... epilogue_params) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + StrideB b_stride{ldb, Int<1>{}, 0}; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + using GemmKernel = typename Gemm::GemmKernel; + typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, + b_stride}; + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + std::forward(epilogue_params)...), + c_ptr, c_stride, c_ptr, c_stride}; + + typename GemmKernel::Arguments args{Gemm::Mode, prob_shape, mainloop_args, + epilogue_args}; + + // Launch the CUTLASS GEMM kernel. + using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +using ReductionMode = cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90StreamKParams::ReductionMode; +using DecompositionMode = cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90StreamKParams::DecompositionMode; +using RasterOrderOptions = cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90Params::RasterOrderOptions; + +template +inline void cutlass_gemm_caller_streamk(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + ReductionMode reduction_mode, + DecompositionMode decomposition_mode, + EpilogueArgs&&... 
epilogue_params) { + + static_assert(std::is_same::value, "Must be streamk scheduler"); + + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + StrideB b_stride{ldb, Int<1>{}, 0}; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + using GemmKernel = typename Gemm::GemmKernel; + typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, + b_stride}; + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + std::forward(epilogue_params)...), + c_ptr, c_stride, c_ptr, c_stride}; + + typename GemmKernel::TileSchedulerArguments tile_scheduler_args( + 1, + 1, + RasterOrderOptions::Heuristic, + decomposition_mode + ); + tile_scheduler_args.reduction_mode = reduction_mode; + + // Copied from examples... + // The KernelHardwareInfo struct holds the number of SMs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + typename GemmKernel::Arguments args{Gemm::Mode, prob_shape, mainloop_args, + epilogue_args, hw_info, tile_scheduler_args}; + + // Launch the CUTLASS GEMM kernel. 
+ using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +template typename Epilogue> +struct sm90_fp8_config_default { + // M in (128, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_M128 { + // M in (64, 128] + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_M64 { + // M in [1, 64] + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _128>; + using ClusterShape = Shape<_1, _8, _1>; + + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_default { + // For M > 128 and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M128 { + // For M in (64, 128] and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M64 { + // For M in (32, 64] and any N + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NBig { + // For M in [1, 32] and N >= 8192 + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _256>; + using ClusterShape = Shape<_1, _4, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NSmall { + // For M in [1, 32] and N < 8192 + 
static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _8, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template +struct cutlass_3x_simple_gemm { + static const GemmUniversalMode Mode = Mode_; + using ElementAB = ElementAB_; + using ElementD = ElementD_; + + using ElementAcc = + typename std::conditional, AccType, + AccType>::type; + + using StrideD = Stride, Int<0>>; + using ElementC = void; + using StrideC = StrideD; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4, + cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp; + + static constexpr size_t CEStorageSize = + sizeof(typename CollectiveEpilogue::SharedStorage); + using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(CEStorageSize)>; + + // clang-format off + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementAB, cutlass::layout::RowMajor, 16, + ElementAB, cutlass::layout::ColumnMajor, 16, + ElementAcc, TileShape, ClusterShape, + Stages, + KernelSchedule>::CollectiveOp; + // clang-format on + + using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + TileSchedule>>; + + struct GemmKernel : public KernelType {}; +}; + +template +inline void cutlass_simple_gemm_caller(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + StrideB b_stride{ldb, Int<1>{}, 0}; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + using GemmKernel = typename Gemm::GemmKernel; + typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, + b_stride}; + + auto c_ptr = static_cast(out.data_ptr()); + + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, c_ptr, c_stride, c_ptr, c_stride}; + + typename GemmKernel::Arguments args{Gemm::Mode, prob_shape, mainloop_args, + epilogue_args}; + + // Launch the CUTLASS GEMM kernel. 
+ using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +#endif diff --git a/csrc/sparse/cutlass/sparse_compressor.cu b/csrc/sparse/cutlass/sparse_compressor.cu new file mode 100644 index 0000000000000..660ee33044d9f --- /dev/null +++ b/csrc/sparse/cutlass/sparse_compressor.cu @@ -0,0 +1,203 @@ +#include + +#include + +#include + +#include +#include +#include + +#include "cutlass/cutlass.h" + + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/detail/dependent_false.hpp" + +#include "util/broadcast_load_epilogue_c3x.hpp" +#include "util/common.hpp" + +#include "cutlass/transform/device/transform_universal_adapter.hpp" +#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp" + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "util/host_tensor.h" +#include "util/packed_stride.hpp" + +#include "util/helper.h" + +#include "sparse_scaled_mm_c3x.cuh" + +/// Make A structured sparse by replacing elements with 0 and compress it +template +bool sparsify_and_compress(torch::Tensor& a_compressed, torch::Tensor& e, torch::Tensor const& a) +{ + // Checks for conformality + TORCH_CHECK(a.dtype() == torch::kInt8 || + a.dtype() == torch::kFloat8_e4m3fn || + a.dtype() == torch::kFloat16 || + a.dtype() == torch::kBFloat16); + TORCH_CHECK(a.dim() == 2) + // Check for strides and alignment + TORCH_CHECK(a.stride(1) == 1) + + int m = a.size(0); + int k = a.size(1); + + using ProblemShape = Shape; + using ElementA = ElementA_; + using LayoutTagA = cutlass::layout::RowMajor; + + // Layouts for reference (non-sparse) tensors + using StrideA = cutlass::gemm::TagToStrideA_t; + using StrideE = StrideA; + + using Gemm = + typename std::conditional, + typename sm90_int8_config_default::Cutlass3xGemm, + typename std::conditional, + typename sm90_fp8_config_default::Cutlass3xGemm, + typename std::conditional, + typename sm90_fp16_config_default::Cutlass3xGemm, + typename sm90_bf16_config_default::Cutlass3xGemm + >::type + >::type + >::type; + + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + // Just a dummy value + int32_t n = 128; + + int64_t lda = a.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + + using GemmKernel = typename Gemm::GemmKernel; + typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; + + using LayoutA = typename GemmKernel::CollectiveMainloop::LayoutA; + using LayoutE = typename GemmKernel::CollectiveMainloop::LayoutE; + + using 
ElementE = typename GemmKernel::CollectiveMainloop::ElementE; + using SparseConfig = typename GemmKernel::CollectiveMainloop::SparseConfig; + + LayoutA a_layout = SparseConfig::fill_layoutA(prob_shape); + LayoutE e_layout = SparseConfig::fill_layoutE(prob_shape); + + // typename Gemm::GemmKernel::ProblemShape prob_shape{m, 1, k, 1}; + + // Offline compressor kernel + using CompressorUtility = cutlass::transform::kernel::StructuredSparseCompressorUtility< + ProblemShape, + ElementA, + LayoutTagA, + SparseConfig>; + + using CompressorKernel = cutlass::transform::kernel::StructuredSparseCompressor< + ProblemShape, + ElementA, + LayoutTagA, + SparseConfig, + cutlass::arch::Sm90>; + + using Compressor = cutlass::transform::device::TransformUniversalAdapter; + + auto [M, N, K, L] = prob_shape; + + StrideA stride_A; + StrideA stride_A_compressed; + StrideE stride_E; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + + CompressorUtility compressor_utility(prob_shape, stride_A); + + int ME = compressor_utility.get_metadata_m_physical(); + int KE = compressor_utility.get_metadata_k_physical(); + int KC = compressor_utility.get_tensorA_k_physical(); + + auto a_ptr = static_cast(a.data_ptr()); + + // cutlass::DeviceAllocation block_A; + // cutlass::DeviceAllocation block_A_compressed; + // cutlass::DeviceAllocation block_E; + + auto a_compressed_ptr = static_cast(a_compressed.data_ptr()); + auto e_ptr = static_cast(e.data_ptr()); + + // block_A_compressed.reset(M * KC * L); + // block_E.reset(ME * KE * L); + + stride_A_compressed = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, KC, L)); + stride_E = cutlass::make_cute_packed_stride(StrideE{}, cute::make_shape(ME, KE, L)); + + // // Random sparsification is performed on host + // std::vector block_A_host(m * k); + // cutlass::device_memory::copy_to_host(block_A_host.data(), a_ptr, m * k); + // compressor_utility.structure_sparse_zero_mask_fill(block_A_host.data(), 2024); + // cutlass::device_memory::copy_to_device(a_ptr, block_A_host.data(), m * k); + + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + typename Compressor::Arguments arguments { + prob_shape, + { a_ptr, + stride_A, + a_compressed_ptr, + e_ptr }, + {hw_info} }; + + Compressor compressor_op; + size_t workspace_size = Compressor::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(compressor_op.can_implement(arguments)); + CUTLASS_CHECK(compressor_op.initialize(arguments, workspace.get())); + CUTLASS_CHECK(compressor_op.run()); + CUDA_CHECK(cudaDeviceSynchronize()); + + return true; +} + +bool cutlass_compress_entry(torch::Tensor& a_compressed, torch::Tensor& e, torch::Tensor const& a) +{ + if (a.dtype() == torch::kBFloat16) { + return sparsify_and_compress(a_compressed, e, a); + } else if (a.dtype() == torch::kFloat16) { + return sparsify_and_compress(a_compressed, e, a); + } else if (a.dtype() == torch::kFloat8_e4m3fn) { + return sparsify_and_compress(a_compressed, e, a); + } + else if (a.dtype() == torch::kInt8) { + return sparsify_and_compress(a_compressed, e, a); + } + return false; +} \ No newline at end of file diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu new file mode 100644 index 0000000000000..a62598587b1b1 --- /dev/null +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -0,0 +1,285 @@ +// 
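// ---------------------------------------------------------------------------
// Illustrative sketch (assumption, not part of this patch): the compressor
// above targets structured sparsity as used by Hopper sparse tensor cores,
// commonly the 2:4 pattern, i.e. at most 2 non-zeros in every group of 4
// consecutive elements along K. The 2:4 group size and the checker below are
// assumptions made only to make that constraint concrete.
#include <torch/torch.h>

bool is_2_4_structured(torch::Tensor const& a) {  // a: [m, k] with k % 4 == 0
  auto groups = a.reshape({a.size(0), a.size(1) / 4, 4});
  auto nnz_per_group = (groups != 0).sum(/*dim=*/-1);
  return nnz_per_group.le(2).all().item<bool>();
}
// ---------------------------------------------------------------------------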
clang-format will break include orders +// clang-format off +#include + +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 + +#include + +#include + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "util/broadcast_load_epilogue_c3x.hpp" +#include "util/common.hpp" +// clang-format on + +#include "sparse_scaled_mm_c3x.cuh" + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& e, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(e.dtype() == torch::kUInt8); + TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmDefault = + typename sm90_fp8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_fp8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_fp8_config_M128::Cutlass3xGemm; + using Cutlass3xGemmM256 = + typename sm90_fp8_config_M256::Cutlass3xGemm; + using Cutlass3xGemmM512 = + typename sm90_fp8_config_M512::Cutlass3xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(64), next_pow_2(m)); // next power of 2 + + if (mp2 <= 64) { + // m in [1, 64] + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); + } else if (mp2 <= 256) { + // m in (128, 256] + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); + } else { + // m in (256, inf) + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); + } +} + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_fp16_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& e, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat16); + TORCH_CHECK(e.dtype() == torch::kUInt8); + TORCH_CHECK(b.dtype() == torch::kFloat16); + + using Cutlass3xGemmDefault = + typename sm90_fp16_config_default::Cutlass3xGemm; + + // m in (128, inf) + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); +} + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_bf16_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& e, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kBFloat16); + TORCH_CHECK(e.dtype() == torch::kUInt8); + TORCH_CHECK(b.dtype() == torch::kBFloat16); + + using Cutlass3xGemmDefault = + typename sm90_bf16_config_default::Cutlass3xGemm; + + // m in (128, inf) + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); +} + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& e, + torch::Tensor const& b, + EpilogueArgs&&... 
args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(e.dtype() == torch::kUInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass3xGemmDefault = + typename sm90_int8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_int8_config_M128::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_int8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM32NBig = + typename sm90_int8_config_M32_NBig::Cutlass3xGemm; + using Cutlass3xGemmM32NSmall = + typename sm90_int8_config_M32_NSmall::Cutlass3xGemm; + + uint32_t const n = out.size(1); + bool const is_small_n = n < 8192; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(32), next_pow_2(m)); // next power of 2 + + if (mp2 <= 32) { + // m in [1, 32] + if (is_small_n) { + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); + } else { + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); + } + } else if (mp2 <= 64) { + // m in (32, 64] + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); + } else { + // m in (128, inf) + return cutlass_sparse_gemm_caller( + out, a, e, b, std::forward(args)...); + } +} + +template
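// ---------------------------------------------------------------------------
// For reference: the M-bucketing in the dispatch functions above relies on a
// next_pow_2 helper, assumed here to be provided by the shared utility headers
// pulled in via common.hpp. A minimal equivalent sketch (not the patch's own
// definition):
#include <cstdint>

static inline uint32_t next_pow_2_sketch(uint32_t x) {
  // Smallest power of two >= x; clamps 0 and 1 to 1.
  if (x <= 1) return 1;
  return 1u << (32 - __builtin_clz(x - 1));
}
// e.g. m = 100 -> 128, so after std::max(64, ...) the (64, 128] tile
// configuration is selected in the fp8 and int8 dispatchers above.
// ---------------------------------------------------------------------------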