From 96ae0eaeb270be8741abb30f2251670b4554e886 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 14:34:39 -0800 Subject: [PATCH 01/39] [doc] fix location of runllm widget (#10266) Signed-off-by: youkaichao --- docs/source/_static/custom.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index dac40ca2cfe75..18b502c786e1d 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -8,7 +8,9 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("version", "stable"); script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. script.setAttribute("runllm-name", "vLLM"); - script.setAttribute("runllm-position", "TOP_RIGHT"); + script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-position-y", "20%"); + script.setAttribute("runllm-position-x", "3%"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; From 18081451f9f5dd3ae476ff1e217d5573832b2604 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 14:43:52 -0800 Subject: [PATCH 02/39] [doc] improve debugging doc (#10270) Signed-off-by: youkaichao --- docs/source/getting_started/debugging.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 060599680be25..77bf550601346 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -20,6 +20,10 @@ Hangs loading a model from disk If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. +.. note:: + + To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. + Model is too large ---------------------------------------- If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. 
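A minimal sketch of the ``--load-format dummy`` tip added above, expressed through the Python API (``load_format="dummy"`` is the engine-argument counterpart of the CLI flag; the model name below is only a placeholder, not something taken from these patches):

.. code-block:: python

    # Start an engine with randomly initialized ("dummy") weights so that no
    # real weights are read from disk or downloaded. If startup is fast with
    # dummy weights but slow without them, weight loading is the bottleneck.
    from vllm import LLM

    llm = LLM(
        model="facebook/opt-125m",  # placeholder; substitute the model you are debugging
        load_format="dummy",        # skip loading the real model weights
    )
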
From 377b74fe877c7eb4632c2ca0778b9da9a5db8ae6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 15:06:48 -0800 Subject: [PATCH 03/39] Revert "[ci][build] limit cmake version" (#10271) --- Dockerfile.neuron | 2 +- Dockerfile.ppc64le | 2 +- docs/source/getting_started/cpu-installation.rst | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements-tpu.txt | 2 +- requirements-xpu.txt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 47e40e015239a..2143315d2a078 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -31,7 +31,7 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-neuron.txt ENV VLLM_TARGET_DEVICE neuron diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index c2a40000aab4b..b19c6ddec7948 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 6bf170b164fb8..69530fd778c55 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -62,7 +62,7 @@ Build from source .. 
code-block:: console $ pip install --upgrade pip - $ pip install cmake>=3.26,<=3.30 wheel packaging ninja "setuptools-scm>=8" numpy + $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - Finally, build and install vLLM CPU backend: diff --git a/pyproject.toml b/pyproject.toml index 3be401daa44c7..3c8c46cc8621e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] # Should be mirrored in requirements-build.txt requires = [ - "cmake>=3.26,<=3.30", + "cmake>=3.26", "ninja", "packaging", "setuptools>=61", diff --git a/requirements-build.txt b/requirements-build.txt index 64b92861df25d..fec01caaf25ef 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,5 +1,5 @@ # Should be mirrored in pyproject.toml -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools>=61 diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 94a3225dcf479..f9a0770804e55 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for TPU -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools-scm>=8 diff --git a/requirements-xpu.txt b/requirements-xpu.txt index 479cb4bb18484..e41295792283f 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt ray >= 2.9 -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools-scm>=8 From 112fa0bbe5e5354f592a42913a4e6d72e0407b93 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 16:17:20 -0800 Subject: [PATCH 04/39] [V1] Fix CI tests on V1 engine (#10272) Signed-off-by: Woosuk Kwon --- tests/v1/engine/test_engine_core.py | 3 +++ tests/v1/engine/test_engine_core_client.py | 3 +++ vllm/v1/engine/core.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 8451aac33acc4..b3692b594326a 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -27,6 +27,9 @@ def make_request() -> EngineCoreRequest: request_id=uuid.uuid4(), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, sampling_params=SamplingParams(), eos_token_id=None, arrival_time=time.time(), diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index d582101a1164f..7b241bf836a0e 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -29,6 +29,9 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: request_id=str(uuid.uuid4()), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, sampling_params=params, eos_token_id=None, arrival_time=time.time(), diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 808c3936b6c35..428483bdb29cb 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -317,7 +317,7 @@ def process_input_socket(self, input_path: str): # Msgpack serialization decoding. 
decoder_add_req = PickleEncoder() - decoder_abort_req = msgpack.Decoder(list[str]) + decoder_abort_req = PickleEncoder() with self.make_socket(input_path, zmq.constants.PULL) as socket: while True: From 0d4ea3fb5c8c499b70cea8b1deee3e34a147cff1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 17:36:08 -0800 Subject: [PATCH 05/39] [core][distributed] use tcp store directly (#10275) Signed-off-by: youkaichao --- tests/distributed/test_utils.py | 26 ++++++++++++++++---------- vllm/distributed/utils.py | 28 +++++++++++++--------------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 5d77d8abb4718..50444d3abfaf2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -43,12 +43,15 @@ def test_cuda_device_count_stateless(): def cpu_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) if rank <= 2: - pg2 = StatelessProcessGroup.create( - init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3) + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + rank=rank, + world_size=3) data = torch.tensor([rank]) data = pg1.broadcast_obj(data, src=2) assert data.item() == 2 @@ -62,14 +65,17 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2): def gpu_worker(rank, WORLD_SIZE, port1, port2): torch.cuda.set_device(rank) - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) pynccl1 = PyNcclCommunicator(pg1, device=rank) pynccl1.disabled = False if rank <= 2: - pg2 = StatelessProcessGroup.create( - init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3) + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + rank=rank, + world_size=3) pynccl2 = PyNcclCommunicator(pg2, device=rank) pynccl2.disabled = False data = torch.tensor([rank]).cuda() @@ -89,7 +95,8 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2): def broadcast_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) if rank == 2: @@ -101,7 +108,8 @@ def broadcast_worker(rank, WORLD_SIZE, port1, port2): def allgather_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) data = pg1.all_gather_obj(rank) @@ -109,8 +117,6 @@ def allgather_worker(rank, WORLD_SIZE, port1, port2): pg1.barrier() -# TODO: investigate why this test is flaky. It hangs during initialization. 
-@pytest.mark.skip("Skip the test because it is flaky.") @multi_gpu_test(num_gpus=4) @pytest.mark.parametrize( "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index a77b41322f376..dcfcb848cbe06 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -9,7 +9,7 @@ from typing import Any, Deque, Dict, Optional, Sequence, Tuple import torch -from torch.distributed.rendezvous import rendezvous +from torch.distributed import TCPStore import vllm.envs as envs from vllm.logger import init_logger @@ -97,7 +97,6 @@ class StatelessProcessGroup: group. Only use it to communicate metadata between processes. For data-plane communication, create NCCL-related objects. """ - prefix: str rank: int world_size: int store: torch._C._distributed_c10d.Store @@ -127,7 +126,7 @@ def __post_init__(self): def send_obj(self, obj: Any, dst: int): """Send an object to a destination rank.""" self.expire_data() - key = f"{self.prefix}/send_to/{dst}/{self.send_dst_counter[dst]}" + key = f"send_to/{dst}/{self.send_dst_counter[dst]}" self.store.set(key, pickle.dumps(obj)) self.send_dst_counter[dst] += 1 self.entries.append((key, time.time())) @@ -147,8 +146,7 @@ def recv_obj(self, src: int) -> Any: """Receive an object from a source rank.""" obj = pickle.loads( self.store.get( - f"{self.prefix}/send_to/{self.rank}/{self.recv_src_counter[src]}" - )) + f"send_to/{self.rank}/{self.recv_src_counter[src]}")) self.recv_src_counter[src] += 1 return obj @@ -159,14 +157,14 @@ def broadcast_obj(self, obj: Optional[Any], src: int) -> Any: """ if self.rank == src: self.expire_data() - key = (f"{self.prefix}/broadcast_from/{src}/" + key = (f"broadcast_from/{src}/" f"{self.broadcast_send_counter}") self.store.set(key, pickle.dumps(obj)) self.broadcast_send_counter += 1 self.entries.append((key, time.time())) return obj else: - key = (f"{self.prefix}/broadcast_from/{src}/" + key = (f"broadcast_from/{src}/" f"{self.broadcast_recv_src_counter[src]}") recv_obj = pickle.loads(self.store.get(key)) self.broadcast_recv_src_counter[src] += 1 @@ -194,7 +192,8 @@ def barrier(self): @staticmethod def create( - init_method: str, + host: str, + port: int, rank: int, world_size: int, data_expiration_seconds: int = 3600, @@ -214,15 +213,14 @@ def create( can call `StatelessProcessGroup.create` to form a group, and then process A, B, C, and D can call `StatelessProcessGroup.create` to form another group. 
""" # noqa - from torch._C._distributed_c10d import _DEFAULT_PG_TIMEOUT - timeout = _DEFAULT_PG_TIMEOUT - - store, rank, world_size = next( - rendezvous(init_method, rank, world_size, timeout=timeout)) - store.set_timeout(timeout) + store = TCPStore( + host_name=host, + port=port, + world_size=world_size, + is_master=(rank == 0), + ) return StatelessProcessGroup( - prefix=init_method, rank=rank, world_size=world_size, store=store, From bbd3e86926f15e59e4c62246b4b3185e71fe7ff2 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 20:53:13 -0800 Subject: [PATCH 06/39] [V1] Support VLMs with fine-grained scheduling (#9871) Signed-off-by: Woosuk Kwon Co-authored-by: Roger Wang --- vllm/model_executor/models/gpt2.py | 11 +- vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/models/llava.py | 46 +++--- vllm/model_executor/models/opt.py | 7 +- vllm/model_executor/models/phi3v.py | 63 +++++--- vllm/model_executor/models/qwen2.py | 7 +- vllm/v1/core/encoder_cache_manager.py | 48 ++++++ vllm/v1/core/scheduler.py | 205 +++++++++++++++++++++++--- vllm/v1/engine/core.py | 10 ++ vllm/v1/engine/mm_input_mapper.py | 39 +++++ vllm/v1/request.py | 41 +++++- vllm/v1/worker/gpu_model_runner.py | 154 ++++++++++++++++--- 12 files changed, 542 insertions(+), 96 deletions(-) create mode 100644 vllm/v1/core/encoder_cache_manager.py create mode 100644 vllm/v1/engine/mm_input_mapper.py diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index fcff7ec2e01eb..adf2a7a51f737 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -216,9 +216,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor], ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - inputs_embeds = self.wte(input_ids) + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds else: @@ -263,6 +265,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -270,9 +275,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2472128976d88..8aed0fead18f9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -538,6 +538,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): normalize=False, softmax=False) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -545,9 +548,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> 
Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index ca963fa1c52ea..af712bf8f9506 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -448,6 +449,25 @@ def _process_image_input(self, image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) + def process_mm_inputs(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.config.image_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -455,6 +475,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LLaVA-1.5. 
@@ -494,24 +515,13 @@ def forward( """ if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - else: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + elif inputs_embeds is None: + vision_embeddings = self.process_mm_inputs(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 58b6107eba347..997fe642439e6 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -360,6 +360,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -367,9 +370,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4b5dc944bce4b..de03d28638cda 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -39,6 +39,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of @@ -500,15 +501,20 @@ def input_processor_for_phi3v(ctx: InputContext, # TODO: Move this to utils or integrate with clip. 
new_token_ids: List[int] = [] + placeholder_ranges: List[PlaceholderRange] = [] placeholder_idx = 0 while merged_token_ids: token_id = merged_token_ids.pop(0) if token_id == _IMAGE_TOKEN_ID: - new_token_ids.extend( - repeat_and_pad_token( - _IMAGE_TOKEN_ID, - repeat_count=image_feature_size[placeholder_idx], - )) + replacement_ids = repeat_and_pad_token( + _IMAGE_TOKEN_ID, + repeat_count=image_feature_size[placeholder_idx], + ) + placeholder_ranges.append({ + "offset": len(new_token_ids), + "length": len(replacement_ids) + }) + new_token_ids.extend(replacement_ids) placeholder_idx += 1 else: new_token_ids.append(token_id) @@ -516,7 +522,8 @@ def input_processor_for_phi3v(ctx: InputContext, # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) @MULTIMODAL_REGISTRY.register_image_input_mapper() @@ -669,32 +676,42 @@ def _process_image_input( return image_embeds + def process_mm_inputs(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.embed_tokens(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.image_token_id) + return inputs_embeds + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object): if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.embed_tokens(input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - else: - inputs_embeds = self.language_model.model.embed_tokens( - input_ids) - - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + elif inputs_embeds is None: + vision_embeddings = self.process_mm_inputs(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 2195ce49aa9a7..b623c576bb673 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -441,6 +441,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -448,9 +451,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, 
intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py new file mode 100644 index 0000000000000..845bd5ea05e3c --- /dev/null +++ b/vllm/v1/core/encoder_cache_manager.py @@ -0,0 +1,48 @@ +from typing import Dict, List, Set, Tuple + +from vllm.v1.request import Request + + +class EncoderCacheManager: + + def __init__(self, cache_size: int): + self.cache_size = cache_size + self.num_free_slots = cache_size + # req_id -> cached input ids + self.cached: Dict[str, Set[int]] = {} + # List of [req_id, input_id] + self.freed: List[Tuple[str, int]] = [] + + def has_cache(self, request: Request, input_id: int) -> bool: + req_id = request.request_id + return req_id in self.cached and input_id in self.cached[req_id] + + def can_allocate(self, request: Request, input_id: int) -> bool: + num_tokens = request.get_num_encoder_tokens(input_id) + return num_tokens <= self.num_free_slots + + def allocate(self, request: Request, input_id: int) -> None: + req_id = request.request_id + if req_id not in self.cached: + self.cached[req_id] = set() + self.cached[req_id].add(input_id) + self.num_free_slots -= request.get_num_encoder_tokens(input_id) + + def get_cached_input_ids(self, request: Request) -> Set[int]: + return self.cached.get(request.request_id, set()) + + def free(self, request: Request, input_id: int) -> None: + req_id = request.request_id + if req_id not in self.cached: + return + + self.cached[req_id].discard(input_id) + if len(self.cached[req_id]) == 0: + del self.cached[req_id] + self.num_free_slots += request.get_num_encoder_tokens(input_id) + self.freed.append((req_id, input_id)) + + def get_freed_ids(self) -> List[Tuple[str, int]]: + freed = self.freed + self.freed = [] + return freed diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index ee860e792281d..ba50a9786d805 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -1,16 +1,21 @@ from collections import deque from dataclasses import dataclass -from typing import Deque, Dict, Iterable, List, Optional, Set, Union +from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, + Tuple, Union) from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger -from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import SamplingParams +from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.engine import EngineCoreOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus +if TYPE_CHECKING: + from vllm.multimodal import MultiModalKwargs + from vllm.multimodal.base import PlaceholderRange + logger = init_logger(__name__) @@ -61,12 +66,20 @@ def __init__( # Request id -> RunningRequestData self.running_reqs_data: Dict[str, RunningRequestData] = {} - def schedule(self) -> "SchedulerOutput": - scheduled_new_reqs: List[Request] = [] - scheduled_resumed_reqs: List[Request] = [] - scheduled_running_reqs: List[Request] = [] - preempted_reqs: List[Request] = [] + # Encoder-related. 
+ # NOTE(woosuk): Here, "encoder" includes the vision encoder (and + # projector if needed). Currently, we assume that the encoder also + # has the Transformer architecture (e.g., ViT). + # FIXME(woosuk): Below are placeholder values. We need to calculate the + # actual values from the configurations. + self.max_num_encoder_input_tokens = 2048 + # NOTE(woosuk): For the models without encoder (e.g., text-only models), + # the encoder cache will not be initialized and used, regardless of + # the cache size. This is because the memory space for the encoder cache + # is preallocated in the profiling run. + self.encoder_cache_manager = EncoderCacheManager(cache_size=2048) + def schedule(self) -> "SchedulerOutput": # NOTE(woosuk) on the scheduling algorithm: # There's no "decoding phase" nor "prefill phase" in the scheduler. # Each request just has the num_computed_tokens and num_tokens, @@ -74,23 +87,45 @@ def schedule(self) -> "SchedulerOutput": # At each step, the scheduler tries to assign tokens to the requests # so that each request's num_computed_tokens can catch up its # num_tokens. This is general enough to cover chunked prefills, - # prefix caching, and the "jump forward" optimization in the future. + # prefix caching, and the "jump decoding" optimization in the future. + + scheduled_new_reqs: List[Request] = [] + scheduled_resumed_reqs: List[Request] = [] + scheduled_running_reqs: List[Request] = [] + preempted_reqs: List[Request] = [] req_to_new_block_ids: Dict[str, List[int]] = {} num_scheduled_tokens: Dict[str, int] = {} token_budget = self.max_num_scheduled_tokens + # Encoder-related. + scheduled_encoder_inputs: Dict[str, List[int]] = {} + encoder_budget = self.max_num_encoder_input_tokens # First, schedule the RUNNING requests. + # NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to be + # in the "partial" state, where the request has some tokens computed + # but not all. The constraint is due to the persistent batch in the + # V1 model runner. + # TODO(woosuk): Remove this constraint after refactoring model runner. + has_partial_request = False req_index = 0 while req_index < len(self.running): - if token_budget == 0: - break - + # Only the last request in the RUNNING queue can be "partial". + assert not has_partial_request + assert token_budget > 0 request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 + # Schedule encoder inputs. + encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = ( + self._try_schedule_encoder_inputs(request, + request.num_computed_tokens, + num_new_tokens, + encoder_budget)) + assert num_new_tokens > 0 + while True: new_blocks = self.kv_cache_manager.append_slots( request, num_new_tokens) @@ -106,22 +141,40 @@ def schedule(self) -> "SchedulerOutput": preempted_reqs.append(preempted_req) if preempted_req == request: # No more request to preempt. + can_schedule = False break else: # The request can be scheduled. - scheduled_running_reqs.append(request) - - req_to_new_block_ids[request.request_id] = [ - b.block_id for b in new_blocks - ] - num_scheduled_tokens[request.request_id] = num_new_tokens - token_budget -= num_new_tokens - req_index += 1 + can_schedule = True break + if not can_schedule: + break + + # Schedule the request. 
+ scheduled_running_reqs.append(request) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in new_blocks + ] + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + req_index += 1 + has_partial_request = (request.num_computed_tokens + num_new_tokens + < request.num_tokens) + + # Encoder-related. + if encoder_inputs_to_schedule: + scheduled_encoder_inputs[request.request_id] = ( + encoder_inputs_to_schedule) + # Allocate the encoder cache. + for i in encoder_inputs_to_schedule: + self.encoder_cache_manager.allocate(request, i) + encoder_budget = new_encoder_budget # Next, schedule the WAITING requests. if not preempted_reqs: while self.waiting: + if has_partial_request: + break if len(self.running) == self.max_num_running_reqs: break if token_budget == 0: @@ -149,12 +202,21 @@ def schedule(self) -> "SchedulerOutput": computed_blocks.pop() num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 + + # Schedule encoder inputs. + (encoder_inputs_to_schedule, num_new_tokens, + new_encoder_budget) = self._try_schedule_encoder_inputs( + request, num_computed_tokens, num_new_tokens, + encoder_budget) + if num_new_tokens == 0: + # The request cannot be scheduled. + break + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens, computed_blocks) if new_blocks is None: # The request cannot be scheduled. break - request.num_computed_tokens = num_computed_tokens self.waiting.popleft() self.running.append(request) @@ -172,6 +234,18 @@ def schedule(self) -> "SchedulerOutput": num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING + request.num_computed_tokens = num_computed_tokens + has_partial_request = (num_computed_tokens + num_new_tokens < + request.num_tokens) + + # Encoder-related. + if encoder_inputs_to_schedule: + scheduled_encoder_inputs[request.request_id] = ( + encoder_inputs_to_schedule) + # Allocate the encoder cache. + for i in encoder_inputs_to_schedule: + self.encoder_cache_manager.allocate(request, i) + encoder_budget = new_encoder_budget # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) @@ -205,12 +279,14 @@ def schedule(self) -> "SchedulerOutput": scheduled_running_reqs=running_reqs_data, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, + scheduled_encoder_inputs=scheduled_encoder_inputs, preempted_req_ids=preempted_req_ids, # finished_req_ids is an existing state in the scheduler, # instead of being newly scheduled in this step. # It contains the request IDs that are finished in between # the previous and the current steps. finished_req_ids=self.finished_req_ids, + free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(), ) self.finished_req_ids = set() @@ -234,6 +310,72 @@ def _make_running_request_data( self.running_reqs_data[request.request_id] = req_data return req_data + def _try_schedule_encoder_inputs( + self, + request: Request, + num_computed_tokens: int, + num_new_tokens: int, + encoder_budget: int, + ) -> Tuple[List[int], int, int]: + """ + Determine which encoder inputs need to be scheduled in the current step, + and update `num_new_tokens` and encoder token budget accordingly. + + An encoder input will be scheduled if: + - Its output tokens overlap with the range of tokens being computed + in this step, i.e., + [num_computed_tokens, num_computed_tokens + num_new_tokens). 
+ - It is not already computed and stored in the encoder cache. + - There is sufficient encoder token budget to process it. + - The encoder cache has space to store it. + + If an encoder input cannot be scheduled due to cache or budget + limitations, the method adjusts `num_new_tokens` to schedule only the + decoder tokens up to just before the unschedulable encoder input. + """ + if not request.has_encoder_inputs(): + return [], num_new_tokens, encoder_budget + + encoder_inputs_to_schedule: List[int] = [] + mm_positions = request.mm_positions + assert mm_positions is not None + assert len(mm_positions) > 0 + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, num_computed_tokens + num_new_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_new_tokens: + # The encoder input is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder input is already computed and stored + # in the decoder's KV cache. + continue + + if self.encoder_cache_manager.has_cache(request, i): + # The encoder input is already computed and cached. + continue + if not self.encoder_cache_manager.can_allocate(request, i): + # The encoder cache is full. We can only schedule the decoder + # tokens just before the encoder input. + num_new_tokens = start_pos - num_computed_tokens + break + if num_encoder_tokens > encoder_budget: + # The encoder budget is exhausted. We can only schedule the + # decoder tokens up until the encoder input. + # NOTE(woosuk): We assume that the encoder tokens should be + # processed altogether, as the encoder usually uses + # bidirectional attention. + num_new_tokens = start_pos - num_computed_tokens + break + + encoder_budget -= num_encoder_tokens + encoder_inputs_to_schedule.append(i) + return encoder_inputs_to_schedule, num_new_tokens, encoder_budget + def update_from_output( self, scheduler_output: "SchedulerOutput", @@ -251,6 +393,17 @@ def update_from_output( # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. assert request.num_computed_tokens <= request.num_tokens + + cached_encoder_input_ids = ( + self.encoder_cache_manager.get_cached_input_ids(request)) + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. 
+ self.encoder_cache_manager.free(request, input_id) + if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request @@ -355,7 +508,8 @@ class NewRequestData: req_id: str prompt_token_ids: List[int] prompt: Optional[str] - multi_modal_data: Optional[MultiModalDataDict] + mm_inputs: List["MultiModalKwargs"] + mm_positions: List["PlaceholderRange"] sampling_params: SamplingParams block_ids: List[int] num_computed_tokens: int @@ -369,9 +523,10 @@ def from_request( ) -> "NewRequestData": return cls( req_id=request.request_id, - prompt_token_ids=request.inputs["prompt_token_ids"], - prompt=request.inputs.get("prompt"), - multi_modal_data=request.inputs.get("multi_modal_data"), + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + mm_inputs=request.mm_inputs, + mm_positions=request.mm_positions, sampling_params=request.sampling_params, block_ids=block_ids, num_computed_tokens=num_computed_tokens, @@ -429,6 +584,8 @@ class SchedulerOutput: num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int + scheduled_encoder_inputs: Dict[str, List[int]] preempted_req_ids: Set[str] finished_req_ids: Set[str] + free_encoder_input_ids: List[Tuple[str, int]] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 428483bdb29cb..35ed131d50de9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -17,6 +17,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType) +from vllm.v1.engine.mm_input_mapper import MMInputMapper from vllm.v1.executor.gpu_executor import GPUExecutor from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder @@ -65,6 +66,9 @@ def __init__( vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks + # Set up multimodal input mapper (e.g., convert PIL images to tensors). + self.mm_input_mapper = MMInputMapper(vllm_config.model_config) + # Setup scheduler. self.scheduler = Scheduler(vllm_config.scheduler_config, vllm_config.cache_config, @@ -93,6 +97,12 @@ def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" req = Request.from_engine_core_request(request) + # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may + # take 10-50 ms, which can cause a spike in the latency. We should + # consider moving this to a separate thread. 
+ if req.mm_data: + req.mm_inputs = self.mm_input_mapper.process_inputs( + req.mm_data, req.mm_processor_kwargs) self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py new file mode 100644 index 0000000000000..594c973678235 --- /dev/null +++ b/vllm/v1/engine/mm_input_mapper.py @@ -0,0 +1,39 @@ +from typing import Any, Dict, List, Optional + +from vllm.config import ModelConfig +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalKwargs, MultiModalRegistry) + + +class MMInputMapper: + + def __init__( + self, + model_config: ModelConfig, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + ): + self.mm_registry = mm_registry + self.multi_modal_input_mapper = mm_registry.create_input_mapper( + model_config) + self.mm_registry.init_mm_limits_per_prompt(model_config) + + def process_inputs( + self, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Dict[str, Any]], + ) -> List[MultiModalKwargs]: + image_inputs = mm_data["image"] + if not isinstance(image_inputs, list): + image_inputs = [image_inputs] + + # Process each image input separately so that later we can schedule + # them in a fine-grained manner. + mm_inputs: List[MultiModalKwargs] = [] + num_images = len(image_inputs) + for i in range(num_images): + mm_input = self.multi_modal_input_mapper( + {"image": [image_inputs[i]]}, + mm_processor_kwargs=mm_processor_kwargs, + ) + mm_inputs.append(mm_input) + return mm_inputs diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 00e5aea92a8df..f35cf738c89bf 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -3,6 +3,7 @@ from vllm.inputs.data import DecoderOnlyInputs from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics from vllm.v1.engine import EngineCoreRequest @@ -47,14 +48,30 @@ def __init__( self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.num_computed_tokens = 0 + # Raw multimodal data before the mm input mapper (e.g., PIL images). + self.mm_data = inputs.get("multi_modal_data") + self.mm_processor_kwargs = inputs.get("mm_processor_kwargs") + mm_positions = inputs.get("multi_modal_placeholders") + if mm_positions: + # FIXME(woosuk): Support other modalities. + self.mm_positions = mm_positions.get("image", []) + else: + self.mm_positions = [] + # Output of the mm input mapper (e.g., image tensors). 
+ self.mm_inputs: List[MultiModalKwargs] = [] + @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": - return cls( request_id=request.request_id, - inputs=DecoderOnlyInputs(type="token", - prompt_token_ids=request.prompt_token_ids, - prompt=request.prompt), + inputs=DecoderOnlyInputs( + type="token", + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + multi_modal_data=request.mm_data, + multi_modal_placeholders=request.mm_placeholders, + mm_processor_kwargs=request.mm_processor_kwargs, + ), sampling_params=request.sampling_params, eos_token_id=request.eos_token_id, arrival_time=request.arrival_time, @@ -96,9 +113,21 @@ def is_finished(self) -> bool: def get_finished_reason(self) -> Union[str, None]: return RequestStatus.get_finished_reason(self.status) + def has_encoder_inputs(self) -> bool: + return self.mm_data is not None + + @property + def num_encoder_inputs(self) -> int: + return len(self.mm_positions) + + def get_num_encoder_tokens(self, input_id: int) -> int: + assert input_id < len(self.mm_positions) + num_tokens = self.mm_positions[input_id]["length"] + return num_tokens + class RequestStatus(enum.IntEnum): - """Status of a sequence.""" + """Status of a request.""" WAITING = 0 RUNNING = 1 PREEMPTED = 2 @@ -119,7 +148,7 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]: # Mapping of finished statuses to their finish reasons. -# NOTE: The ignored sequences are the sequences whose prompt lengths +# NOTE: The ignored requests are the requests whose prompt lengths # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. _FINISHED_REASON_MAP = { diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index db676e2819bf4..81480786a09e1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,7 +1,7 @@ import os import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple import numpy as np import torch @@ -14,9 +14,10 @@ from vllm.compilation.levels import CompilationLevel from vllm.config import VllmConfig from vllm.forward_context import set_forward_context +from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model -from vllm.multimodal import MultiModalDataDict +from vllm.multimodal import MultiModalKwargs from vllm.plugins import set_compilation_config from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, @@ -27,6 +28,7 @@ from vllm.v1.sample.metadata import SamplingMetadata if TYPE_CHECKING: + from vllm.multimodal.base import PlaceholderRange from vllm.v1.core.scheduler import SchedulerOutput logger = init_logger(__name__) @@ -37,8 +39,8 @@ class GPUModelRunner: def __init__( self, vllm_config: VllmConfig, + input_registry: InputRegistry = INPUT_REGISTRY, ): - # TODO: use ModelRunnerBase.__init__(self, vllm_config=vllm_config) self.vllm_config = vllm_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -75,10 +77,16 @@ def __init__( parallel_config) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() + self.hidden_size = model_config.get_hidden_size() + + # Multi-modal data support + 
self.input_registry = input_registry # Lazy initialization # self.model: nn.Module # Set after load_model self.kv_caches: List[torch.Tensor] = [] + # req_id -> (input_id -> encoder_output) + self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} # Request states. self.requests: Dict[str, CachedRequestState] = {} @@ -96,18 +104,28 @@ def __init__( and not self.model_config.enforce_eager) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] - self.input_ids = torch.zeros(self.max_num_tokens, - dtype=torch.int32, - device=self.device) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) + self.inputs_embeds = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=self.device) def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. # Keep the states of the pre-empted requests. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) + self.encoder_cache.pop(req_id, None) + + # Free the cached encoder outputs. + for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) # Remove the requests from the persistent batch. stopped_req_ids = set().union( @@ -156,7 +174,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_id=req_id, prompt_token_ids=req_data.prompt_token_ids, prompt=req_data.prompt, - multi_modal_data=req_data.multi_modal_data, + mm_inputs=req_data.mm_inputs, + mm_positions=req_data.mm_positions, sampling_params=sampling_params, generator=generator, block_ids=req_data.block_ids, @@ -285,11 +304,9 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, - non_blocking=True) + input_ids = input_ids.to(self.device, non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) - query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() @@ -308,7 +325,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. logits_indices = query_start_loc[1:] - 1 - return attn_metadata, logits_indices + return input_ids, attn_metadata, logits_indices def _prepare_sampling( self, @@ -325,13 +342,91 @@ def _prepare_sampling( sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) return sampling_metadata + def _execute_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the multi-modal inputs. 
+ mm_inputs: List[MultiModalKwargs] = [] + req_input_ids: List[Tuple[int, int]] = [] + for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): + req_state = self.requests[req_id] + for input_id in encoder_input_ids: + mm_inputs.append(req_state.mm_inputs[input_id]) + req_input_ids.append((req_id, input_id)) + batched_mm_inputs = MultiModalKwargs.batch(mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `encoder_outputs` is either of the following: + # 1. A tensor of shape [num_images, feature_size, hidden_size] + # in case when feature_size is fixed across all images. + # 2. A list (length: num_images) of tensors, each of shape + # [feature_size, hidden_size] in case when the feature size is + # dynamic depending on input images. + encoder_outputs = self.model.process_mm_inputs(**batched_mm_inputs) + + # Cache the encoder outputs. + for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): + if req_id not in self.encoder_cache: + self.encoder_cache[req_id] = {} + self.encoder_cache[req_id][input_id] = output + + def _gather_encoder_outputs( + self, + scheduler_output: "SchedulerOutput", + ) -> List[torch.Tensor]: + encoder_outputs: List[torch.Tensor] = [] + num_reqs = self.input_batch.num_reqs + for req_id in self.input_batch.req_ids[:num_reqs]: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = req_state.num_computed_tokens + mm_positions = req_state.mm_positions + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + continue + + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens) + assert start_idx < end_idx + assert req_id in self.encoder_cache + assert i in self.encoder_cache[req_id] + encoder_output = self.encoder_cache[req_id][i] + encoder_outputs.append(encoder_output[start_idx:end_idx]) + return encoder_outputs + @torch.inference_mode() def execute_model( self, scheduler_output: "SchedulerOutput", ) -> ModelRunnerOutput: self._update_states(scheduler_output) - attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + + # Run the encoder. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + + # Prepare the decoder inputs. + input_ids, attn_metadata, logits_indices = self._prepare_inputs( + scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -343,12 +438,26 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + # Get the inputs embeds. 
+ if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings), + # always use embeddings (rather than token ids) as input to the model. + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + + # Run the decoder. + # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata): hidden_states = self.model( - input_ids=self.input_ids[:num_input_tokens], + input_ids=None, positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_input_tokens], ) hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] @@ -440,13 +549,16 @@ def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: with set_forward_context(None): # noqa: SIM117 with set_compile_context(self.cudagraph_batch_sizes): # Trigger compilation for general shape. - model(self.input_ids, - self.positions, - dummy_kv_caches, - attn_metadata=None) + model(input_ids=None, + positions=self.positions, + kv_caches=dummy_kv_caches, + attn_metadata=None, + inputs_embeds=self.inputs_embeds) @torch.inference_mode() def profile_run(self) -> None: + # TODO(woosuk): Profile the max memory usage of the encoder and + # the encoder cache. self._dummy_run(self.model, self.max_num_tokens) torch.cuda.synchronize() @@ -468,10 +580,11 @@ def capture_model(self) -> None: # can reuse the memory pool allocated for the large shapes. for num_tokens in reversed(self.cudagraph_batch_sizes): self.model( - self.input_ids[:num_tokens], - self.positions[:num_tokens], + input_ids=None, + positions=self.positions[:num_tokens], kv_caches=self.kv_caches, attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_tokens], ) end_time = time.perf_counter() @@ -506,7 +619,8 @@ class CachedRequestState: req_id: str prompt_token_ids: List[int] prompt: Optional[str] - multi_modal_data: Optional["MultiModalDataDict"] + mm_inputs: List[MultiModalKwargs] + mm_positions: List["PlaceholderRange"] sampling_params: SamplingParams generator: Optional[torch.Generator] From 56a955e7748e497d8c24c79a76c75f3f982fab4a Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 13 Nov 2024 00:54:10 -0500 Subject: [PATCH 07/39] Bump to compressed-tensors v0.8.0 (#10279) Signed-off-by: Dipika --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index ef5ed8b645158..acb766d25a2d9 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -31,4 +31,4 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.7.1 # required for compressed-tensors +compressed-tensors == 0.8.0 # required for compressed-tensors \ No newline at end of file From 032fcf16ae9d924cc98a083c3c8464173f87a49e Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Tue, 12 Nov 2024 21:54:52 -0800 Subject: [PATCH 08/39] [Doc] Fix typo in arg_utils.py (#10264) Signed-off-by: Xin Yang --- vllm/engine/arg_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 27f62b0008578..31aa8c5908719 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -626,8 +626,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=EngineArgs.max_cpu_loras, help=('Maximum number of LoRAs to store in CPU memory. ' - 'Must be >= than max_num_seqs. ' - 'Defaults to max_num_seqs.')) + 'Must be >= than max_loras. ' + 'Defaults to max_loras.')) parser.add_argument( '--fully-sharded-loras', action='store_true', From 3945c82346dae3129213607663bfd17edd905fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=B5=E8=84=91=E6=98=9F=E4=BA=BA?= Date: Wed, 13 Nov 2024 15:07:22 +0800 Subject: [PATCH 09/39] [Model] Add support for Qwen2-VL video embeddings input & multiple image embeddings input with varied resolutions (#10221) Signed-off-by: imkero --- docs/source/models/supported_models.rst | 2 +- .../vision_language/test_qwen2_vl.py | 428 ++++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 180 ++++++-- 3 files changed, 578 insertions(+), 32 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_qwen2_vl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 5a474043078db..ca894819f2c26 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -538,7 +538,7 @@ Text Generation - ✅︎ * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - - T + I\ :sup:`E+` + V\ :sup:`+` + - T + I\ :sup:`E+` + V\ :sup:`E+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. 
- ✅︎ - ✅︎ diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py new file mode 100644 index 0000000000000..718c675b86fb4 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -0,0 +1,428 @@ +from typing import Any, List, Optional, Tuple, Type, TypedDict, Union + +import numpy.typing as npt +import pytest +import torch +from PIL import Image + +from vllm.entrypoints.llm import LLM +from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, + sample_frames_from_video) + +from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, + PromptVideoInput, VllmRunner) +from ...utils import check_logprobs_close + +models = ["Qwen/Qwen2-VL-2B-Instruct"] +target_dtype = "half" + +IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>" +VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" + + +def qwen2_vl_chat_template(*query): + return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501 + + +IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the biggest text's content in this image?", + ), + "cherry_blossom": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the season shown in this image? ", + "Reply with a short sentence (no more than 20 words)", + ), +}) + +VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ + "sample_demo_1": + qwen2_vl_chat_template( + VIDEO_PLACEHOLDER, + "Describe this video with a short sentence ", + "(no more than 20 words)", + ), +}) + +MULTIIMAGE_PROMPT = qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + IMAGE_PLACEHOLDER, + "Describe these two images separately. ", + "For each image, reply with a short sentence ", + "(no more than 10 words).", +) + + +class Qwen2VLPromptImageEmbeddingInput(TypedDict): + image_embeds: torch.Tensor + image_grid_thw: torch.Tensor + + +class Qwen2VLPromptVideoEmbeddingInput(TypedDict): + video_embeds: torch.Tensor + video_grid_thw: torch.Tensor + + +def batch_make_image_embeddings( + image_batches: List[Union[Image.Image, List[Image.Image]]], processor, + llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + """batched image embeddings for Qwen2-VL + + This will infer all images' embeddings in a single batch, + and split the result according to input batches. + + image_batches: + - Single-image batches: `List[Image.Image]` + - Multiple-image batches: `List[List[Image.Image]]]` + + returns: `List[Qwen2VLPromptImageEmbeddingInput]` + """ + + image_batches_: List[Any] = image_batches[:] + + # convert single-image batches to multiple-image batches + for idx in range(len(image_batches_)): + if not isinstance(image_batches_[idx], list): + image_batches_[idx] = [image_batches_[idx]] + + assert isinstance(image_batches_[idx], list) + + # append all images into a list (as a batch) + images: List[Image.Image] = [] + for image_batch in image_batches_: + images += image_batch + + # image to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=images, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values"] + image_grid_thw = preprocess_result["image_grid_thw"] + + # pixel values to embeddinds & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker. 
\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + image_embeds = visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptImageEmbeddingInput] = [] + image_counter = 0 + embed_counter = 0 + for image_batch in image_batches_: + cur_batch_image_count = len(image_batch) + merge_size = image_processor.merge_size + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in image_grid_thw[image_counter:image_counter + + cur_batch_image_count] + ]) + + result.append({ + "image_embeds": + image_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "image_grid_thw": + image_grid_thw[image_counter:image_counter + + cur_batch_image_count], + }) + + embed_counter += cur_batch_embed_len + image_counter += cur_batch_image_count + + # ensure we don't lost any images or embeddings + assert embed_counter == image_embeds.size(0) + assert image_counter == image_grid_thw.size(0) + assert len(image_batches) == len(result) + + return result + + +def batch_make_video_embeddings( + video_batches: PromptVideoInput, processor, + llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + """batched video embeddings for Qwen2-VL + + A NDArray represents a single video's all frames. + + This will infer all videos' embeddings in a single batch, + and split the result according to input batches. + + video_batches: + - Single-video batches: `List[NDArray]` + - Multiple-video batches: `List[List[NDArray]]` + """ + + video_batches_: List[Any] = video_batches[:] + + for idx in range(len(video_batches_)): + if not isinstance(video_batches_[idx], list): + single_video_batch: List[npt.NDArray] = [video_batches_[idx]] + video_batches_[idx] = single_video_batch + + assert isinstance(video_batches_[idx], list) + + # append all videos into a list (as a batch) + videos: List[npt.NDArray] = [] + for video_batch in video_batches_: + videos += video_batch + + # video to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=None, videos=videos, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values_videos"] + video_grid_thw = preprocess_result["video_grid_thw"] + + # pixel values to embeddinds & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker.\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + video_embeds = visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptVideoEmbeddingInput] = [] + video_counter = 0 + embed_counter = 0 + for video_batch in video_batches_: + cur_batch_video_count = len(video_batch) + merge_size = image_processor.merge_size + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in video_grid_thw[video_counter:video_counter + + cur_batch_video_count] + ]) + + result.append({ + "video_embeds": + video_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "video_grid_thw": + video_grid_thw[video_counter:video_counter + + cur_batch_video_count], + }) + + embed_counter += cur_batch_embed_len + video_counter += cur_batch_video_count + + # ensure we don't lost any videos or 
embeddings + assert embed_counter == video_embeds.size(0) + assert video_counter == video_grid_thw.size(0) + assert len(video_batches) == len(result) + + return result + + +def run_test( + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between + original image/video input and image/video embeddings input. + """ + from transformers import AutoProcessor # noqa: F401 + + processor = AutoProcessor.from_pretrained(model) + + # NOTE: + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + task="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + + outputs_per_case_for_original_input = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None) + for prompts, images, videos in inputs + ] + + outputs_per_case_for_embeddings_input = [ + vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=batch_make_image_embeddings( + images, processor, vllm_model.model) if images else None, + videos=batch_make_video_embeddings( + videos, processor, vllm_model.model) if videos else None) + for prompts, images, videos in inputs + ] + + for outputs_for_original_input, \ + outputs_for_embeddings_input \ + in zip(outputs_per_case_for_original_input, + outputs_per_case_for_embeddings_input): + check_logprobs_close( + outputs_0_lst=outputs_for_original_input, + outputs_1_lst=outputs_for_embeddings_input, + name_0="original_input", + name_1="embeddings_input", + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + [], + ) for image, prompt in zip(images, IMAGE_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + [], + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, + model, size_factors, + dtype: str, max_tokens: int, + num_logprobs: int) -> None: + images 
= [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[List[str], PromptImageInput, + PromptVideoInput]] = [( + [MULTIIMAGE_PROMPT for _ in size_factors], + [[ + rescale_image_size(image, factor) + for image in images + ] for factor in size_factors], + [], + )] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=2, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.25, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + num_frames = 4 + sampled_vids = [ + sample_frames_from_video(asset.np_ndarrays, num_frames) + for asset in video_assets + ] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [], + [rescale_video_size(video, factor) for factor in size_factors], + ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 13109758767df..1b162e7df8578 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -79,7 +79,7 @@ class Qwen2VLImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: torch.Tensor + pixel_values: torch.Tensor """Shape: `(num_patches, num_channels * patch_size * patch_size)` """ @@ -92,9 +92,22 @@ class Qwen2VLImagePixelInputs(TypedDict): class Qwen2VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` - `hidden_size` must match the hidden size of language model backbone. + image_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all images' features. + Each tensor holds an image's features. + - `torch.Tensor`: A tensor holding all images' features + (concatenation of all images' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the images. + - `hidden_size` must match the hidden size of language model backbone. + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. """ @@ -102,7 +115,8 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): Qwen2VLImageEmbeddingInputs] -class Qwen2VLVideoInputs(TypedDict): +class Qwen2VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] pixel_values_videos: torch.Tensor """Shape: `(num_patches, @@ -116,6 +130,30 @@ class Qwen2VLVideoInputs(TypedDict): """ +class Qwen2VLVideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + video_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all videos' features. + Each tensor holds an video's features. 
+ - `torch.Tensor`: A tensor holding all videos' features + (concatenation of all videos' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the videos. + - `hidden_size` must match the hidden size of language model backbone. + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2VLVideoInputs = Union[Qwen2VLVideoPixelInputs, + Qwen2VLVideoEmbeddingInputs] + # === Vision Encoder === # @@ -585,6 +623,12 @@ def mm_input_mapper_for_qwen2_vl( "image_embeds": data.get("image_embeds"), "image_grid_thw": data.get("image_grid_thw"), }) + if data_type_key == "video" and isinstance(data, dict): + return MultiModalKwargs({ + "video_embeds": data.get("video_embeds"), + "video_grid_thw": data.get("video_grid_thw"), + }) + model_config = ctx.model_config # Handle mm processor kwargs; we pass these at creation time # because preprocess() in transformers doesn't expose them @@ -890,16 +934,33 @@ def input_processor_for_qwen2_vl( idx for idx, token in enumerate(prompt_token_ids) if token == hf_config.image_token_id ] - image_cnt = len(image_indices) - embed_dim = image_inputs.get('image_embeds').size(0) - assert embed_dim % image_cnt == 0 - num_pad_tokens = embed_dim // image_cnt + + # ensure all image tokens have grid_thw + assert \ + len(image_indices) == image_inputs["image_grid_thw"].size(0), \ + "image token num does not match image_grid_thw.shape" + + image_counter = 0 + pad_token_counter = 0 for idx, token in enumerate(prompt_token_ids): if idx in image_indices: + grid_thw = image_inputs["image_grid_thw"][image_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) prompt_token_ids_with_image.extend([token] * num_pad_tokens) + image_counter += 1 + pad_token_counter += num_pad_tokens else: prompt_token_ids_with_image.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == image_inputs["image_embeds"].size(0), \ + "image_embeds.shape does not match image_grid_thw" + prompt_token_ids = prompt_token_ids_with_image else: prompt_token_ids = _expand_pad_tokens(image_inputs, @@ -912,14 +973,49 @@ def input_processor_for_qwen2_vl( max_pixels=max_pixels) if video_inputs is not None: - prompt_token_ids = _expand_pad_tokens(video_inputs, - hf_config.video_token_id, - make_batched_videos, - "video", - image_processor, - prompt_token_ids, - min_pixels=min_pixels, - max_pixels=max_pixels) + if isinstance(video_inputs, dict): + prompt_token_ids_with_video = [] + video_indices = [ + idx for idx, token in enumerate(prompt_token_ids) + if token == hf_config.video_token_id + ] + + # ensure all video tokens have grid_thw + assert \ + len(video_indices) == video_inputs["video_grid_thw"].size(0), \ + "video token num does not match video_grid_thw.shape" + + video_counter = 0 + pad_token_counter = 0 + for idx, token in enumerate(prompt_token_ids): + if idx in video_indices: + grid_thw = video_inputs["video_grid_thw"][video_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) + prompt_token_ids_with_video.extend([token] * + num_pad_tokens) + video_counter += 1 + pad_token_counter += num_pad_tokens + else: + prompt_token_ids_with_video.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == 
video_inputs["video_embeds"].size(0), \ + "video_embeds.shape does not match video_grid_thw" + + prompt_token_ids = prompt_token_ids_with_video + else: + prompt_token_ids = _expand_pad_tokens(video_inputs, + hf_config.video_token_id, + make_batched_videos, + "video", + image_processor, + prompt_token_ids, + min_pixels=min_pixels, + max_pixels=max_pixels) prompt = inputs.get("prompt") if prompt is None: @@ -1051,49 +1147,71 @@ def _parse_and_validate_image_input( f"Got type: {type(pixel_values)}") return Qwen2VLImagePixelInputs(type="pixel_values", - data=pixel_values, + pixel_values=pixel_values, image_grid_thw=image_grid_thw) if image_embeds is not None: image_embeds = self._validate_and_reshape_mm_tensor( image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") if not isinstance(image_embeds, torch.Tensor): raise ValueError("Incorrect type of image embeddings. " f"Got type: {type(image_embeds)}") return Qwen2VLImageEmbeddingInputs(type="image_embeds", - data=image_embeds) + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]: pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) video_grid_thw = kwargs.pop("video_grid_thw", None) - if pixel_values_videos is None: + if pixel_values_videos is None and video_embeds is None: return None - pixel_values_videos = self._validate_and_reshape_mm_tensor( - pixel_values_videos, "video pixel values") - video_grid_thw = self._validate_and_reshape_mm_tensor( - video_grid_thw, "video grid_thw") - - return Qwen2VLVideoInputs( - pixel_values_videos=pixel_values_videos, - video_grid_thw=video_grid_thw, - ) + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, "video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + if not isinstance(video_embeds, torch.Tensor): + raise ValueError("Incorrect type of video embeddings. 
" + f"Got type: {type(video_embeds)}") + return Qwen2VLVideoEmbeddingInputs(type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw) def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor: if image_input["type"] == "image_embeds": - return image_input["data"].type(self.visual.dtype) + return image_input["image_embeds"].type(self.visual.dtype) - pixel_values = image_input["data"].type(self.visual.dtype) + pixel_values = image_input["pixel_values"].type(self.visual.dtype) image_embeds = self.visual(pixel_values, grid_thw=image_input["image_grid_thw"]) return image_embeds def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor: + if video_input["type"] == "video_embeds": + return video_input["video_embeds"].type(self.visual.dtype) + pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) video_embeds = self.visual(pixel_values_videos, From 1b886aa104248a95720fda7be9f979fc665b3d02 Mon Sep 17 00:00:00 2001 From: Austin Veselka <50646302+FurtherAI@users.noreply.github.com> Date: Wed, 13 Nov 2024 02:28:13 -0600 Subject: [PATCH 10/39] [Model] Adding Support for Qwen2VL as an Embedding Model. Using MrLight/dse-qwen2-2b-mrl-v1 (#9944) Signed-off-by: FurtherAI Co-authored-by: FurtherAI --- docs/source/models/supported_models.rst | 6 + docs/source/models/vlm.rst | 17 ++ ...ai_chat_embedding_client_for_multimodal.py | 123 +++++++++-- examples/template_dse_qwen2_vl.jinja | 7 + tests/conftest.py | 3 + .../vision_language/test_dse_qwen2_vl.py | 209 ++++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 17 +- vllm/model_executor/models/registry.py | 1 + 8 files changed, 364 insertions(+), 19 deletions(-) create mode 100644 examples/template_dse_qwen2_vl.jinja create mode 100644 tests/models/embedding/vision_language/test_dse_qwen2_vl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index ca894819f2c26..58ec3acc6aea5 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -584,6 +584,12 @@ Multimodal Embedding - :code:`TIGER-Lab/VLM2Vec-Full` - 🚧 - ✅︎ + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - :code:`MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ .. important:: Some model architectures support both generation and embedding tasks. diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 112e9db6a41de..bcbe50a25fa09 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -310,4 +310,21 @@ Since the request schema is not defined by OpenAI client, we post a request to t response_json = response.json() print("Embedding output:", response_json["data"][0]["embedding"]) +Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. + +.. code-block:: bash + + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja + +.. important:: + + Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, + which is handled by the jinja template. + +.. important:: + + Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code + example below for details. + A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. 
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py index effb588e1387f..fff82020d9a30 100644 --- a/examples/openai_chat_embedding_client_for_multimodal.py +++ b/examples/openai_chat_embedding_client_for_multimodal.py @@ -1,33 +1,120 @@ +import argparse +import base64 +import io + import requests +from PIL import Image image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": - "TIGER-Lab/VLM2Vec-Full", - "messages": [{ + +def vlm2vec(): + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": + "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." + }, + ], + }], + "encoding_format": + "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +def dse_qwen2_vl(inp: dict): + # Embedding an Image + if inp["dtype"] == "image": + messages = [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": inp["image_url"], + } + }, { + "type": "text", + "text": "What is shown in this image?" + }] + }] + # Embedding a Text Query + else: + # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image + # of the minimum input size + buffer = io.BytesIO() + image_placeholder = Image.new("RGB", (56, 56)) + image_placeholder.save(buffer, "png") + buffer.seek(0) + image_placeholder = base64.b64encode(buffer.read()).decode('utf-8') + messages = [{ "role": "user", "content": [ { "type": "image_url", "image_url": { - "url": image_url + "url": f"data:image/jpeg;base64,{image_placeholder}", } }, { "type": "text", - "text": "Represent the given image." + "text": f"Query: {inp['content']}" }, - ], - }], - "encoding_format": - "float", - }, -) -response.raise_for_status() -response_json = response.json() - -print("Embedding output:", response_json["data"][0]["embedding"]) + ] + }] + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "MrLight/dse-qwen2-2b-mrl-v1", + "messages": messages, + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + "Script to call a specified VLM through the API. 
Make sure to serve " + "the model with --task embedding before running this.") + parser.add_argument("model", + type=str, + choices=["vlm2vec", "dse_qwen2_vl"], + required=True, + help="Which model to call.") + args = parser.parse_args() + + if args.model == "vlm2vec": + vlm2vec() + elif args.model == "dse_qwen2_vl": + dse_qwen2_vl({ + "dtye": "image", + "image_url": image_url, + }) + dse_qwen2_vl({ + "dtype": "text", + "content": "What is the weather like today?", + }) diff --git a/examples/template_dse_qwen2_vl.jinja b/examples/template_dse_qwen2_vl.jinja new file mode 100644 index 0000000000000..e7b93fae31770 --- /dev/null +++ b/examples/template_dse_qwen2_vl.jinja @@ -0,0 +1,7 @@ +{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system +You are a helpful assistant.<|im_end|> +{% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %} +{% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %} +{% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %} +{% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %} +{% endraw %}{% endif %}<|endoftext|> \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 6cf791dc62ce5..0dc1cc6e83c18 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -243,6 +243,9 @@ def video_assets() -> _VideoAssets: class HfRunner: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + if x is None or isinstance(x, (bool, )): + return x + if device is None: device = "cpu" if current_platform.is_cpu() else "cuda" diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py new file mode 100644 index 0000000000000..3dd8cb729f8a6 --- /dev/null +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -0,0 +1,209 @@ +from functools import partial +from typing import Callable, Dict, List, Type + +import pytest +import torch +from PIL import Image +from transformers import BatchEncoding, Qwen2VLForConditionalGeneration + +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test +from ..utils import check_embeddings_close + +HF_TEXT_PROMPTS = [ + # T -> X + ( + "Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501, + Image.new("RGB", (56, 56))), + # T -> X + ("Query: Retrieve an image of this caption: cherry blossom", + Image.new("RGB", (56, 56))), +] + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "What is shown in this image?", + "cherry_blossom": + "What is shown in this image?" 
+}) + +MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"] + + +def get_messages(image: Image.Image, text: str, embed_text: bool): + # assert False, 'remember to use outer [] as required' + if embed_text: + messages = [{ + "role": + "user", + "content": [ + { + "type": "image", + "image": Image.new("RGB", (56, 56)), + "resized_height": 1, + "resized_width": 1 + }, # need a dummy image here for an easier process. + { + "type": "text", + "text": text + }, + ] + }] + else: + messages = [{ + "role": + "user", + "content": [{ + "type": "image", + "image": image + }, { + "type": "text", + "text": text + }] + }] + return messages + + +def apply_chat_template_and_add_eos( + messages: List[Dict], + apply_chat_template_fn: Callable, +): + prompt = apply_chat_template_fn( + messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>" + return prompt + + +def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs): + return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs) + + +def _run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + input_texts: List[str], + input_images: PromptImageInput, + embed_texts: List[bool], + model: str, + *, + dtype: str, +) -> None: + '''SET PYTHONPATH''' + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + with vllm_runner(model, + task="embedding", + dtype=dtype, + enforce_eager=True, + max_model_len=8192) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + texts = [ + # this is necessary because vllm_model.encode will not apply any + # templating to the prompt, and therefore lacks an image_pad + # token unless one is inserted beforehand (the (28,28) image + # above is converted to an image pad token by the chat template). + apply_chat_template_and_add_eos( + get_messages(image, text, False), + apply_chat_template_fn=tokenizer.apply_chat_template, + ) for text, image in zip(input_texts, input_images) + # vllm will replace the pad token with the actual image, + # which may be a placeholder image, later. 
+ ] + vllm_outputs = vllm_model.encode(texts, images=input_images) + + hf_outputs = [] + with hf_runner(model, + dtype=dtype, + auto_cls=Qwen2VLForConditionalGeneration) as hf_model: + hf_model.postprocess_inputs = partial( + postprocess_inputs, + hf_model, + cache_position=torch.arange( + 0, + 1, # 1 for batch size + requires_grad=False), + use_cache=False) + for text, image, embed_text in zip(input_texts, input_images, + embed_texts): + # dse requires non-standard input processing + # because it needs an image_pad token + messages = get_messages(image, text, embed_text) + prompt = apply_chat_template_and_add_eos( + messages, hf_model.processor.apply_chat_template) + inputs = hf_model.get_inputs( + prompts=[[prompt]], + images=[[image]], + ) + with torch.no_grad(): + outputs = hf_model.model( + **hf_model.wrap_device(inputs[0], + device=hf_model.model.device.type), + return_dict=True, + output_hidden_states=True, + ) + pooled_output = torch.nn.functional.normalize( + outputs.hidden_states[-1][0, -1], p=2, dim=-1) + hf_outputs.append(pooled_output.tolist()) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, image_placeholder) + for text, image_placeholder in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [True] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + embed_texts, + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [False] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + embed_texts, + model, + dtype=dtype, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 1b162e7df8578..9a19ccbca3f1e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -51,6 +51,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import (GPTQConfig, GPTQMarlinConfig, QuantizationConfig) @@ -58,12 +59,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalKwargs) from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer -from 
vllm.sequence import IntermediateTensors, SequenceData +from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import cached_get_processor @@ -1067,6 +1069,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config assert not cache_config.enable_prefix_caching, \ "Qwen2-VL currently does not support prefix caching" @@ -1098,6 +1101,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False) self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) @@ -1318,6 +1326,13 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 32750602b988c..f172c06c4a26a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -109,6 +109,7 @@ # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration") # noqa: E501, } _MULTIMODAL_MODELS = { From b6dde330198848a4a9903c1f0f97c3235fba0ba0 Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Wed, 13 Nov 2024 00:29:32 -0800 Subject: [PATCH 11/39] [Core] Flashinfer - Remove advance step size restriction (#10282) --- csrc/prepare_inputs/advance_step.cu | 66 +++++++++++++++++------------ 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index 46fef79f439fb..bd184ee22682e 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -88,6 +88,7 @@ inline void verify_tensor(std::string const& name, torch::Tensor const& t, } } +/// each thread processes a block per query __global__ void advance_step_flashinfer_kernel( int num_threads, int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, long const* sampled_token_ids_ptr, @@ -134,8 +135,10 @@ __global__ void advance_step_flashinfer_indptr_kernel( int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { int idx = blockIdx.x * num_threads + threadIdx.x; - // Update paged_kv_indptr + if (idx == 0) { + paged_kv_indptr_ptr[idx] = 0; + } if (idx < num_queries) { int sum = 0; for (int i = 0; i <= idx; ++i) { @@ -146,20 +149,33 @@ __global__ void advance_step_flashinfer_indptr_kernel( } __global__ void advance_step_flashinfer_indices_kernel( - int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr, 
- int64_t const block_tables_stride, int* paged_kv_indices_ptr, + int num_seqs, int num_queries, int const* block_tables_ptr, + int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { - int idx = blockIdx.x * num_threads + threadIdx.x; - int row = idx / block_tables_stride; - int col = idx % block_tables_stride; - - if (row < num_queries && col < block_table_bound_ptr[row]) { - paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] = - block_tables_ptr[row * block_tables_stride + col]; + // note: max_num_blocks_per_seq = block_tables.stride(0) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // when cuda graphs are enabled, paged_kv_indptr tensor + // has to be updated for the padded queries + // tid represents a query# for paged_kv_indptr tensor + if (num_queries < tid && tid <= num_seqs) { + paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; } - // if cudagraph, fill padded seqs with the last valid seq's indptr - if (num_queries < row && row <= num_seqs) { - paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries]; + + // each thread processes a block_ptr in block_tables + // block_tables shape: [num_queries, max_num_blocks_per_seq] + // paged_kv_indices is flattened block_tables. + for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); + idx += (gridDim.x * blockDim.x)) { + // block_tables-row = paged_kv_indptr[queryNum] + int queryNum = idx / max_num_blocks_per_seq; + int col = idx % max_num_blocks_per_seq; + if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { + int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; + int block_tables_idx = queryNum * max_num_blocks_per_seq + col; + paged_kv_indices_ptr[indices_arr_idx] = + block_tables_ptr[block_tables_idx]; + } } } @@ -247,22 +263,16 @@ void advance_step_flashinfer( int threads; cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - if (logging) { - printf("launching kernel with %d blocks\n", blocks); - } - // TODO(will): support arbitrary block_tables stride - if ((blocks * threads) / block_tables.stride(0) < num_queries) { - TORCH_CHECK(false, - "multi-step: not enough threads to map block_table to" - "FlashInfer's paged_kv_indices on GPU. 
Try reducing the number " - "of seqs,", - " increasing the block size or take smaller steps.", - " num_queries = ", num_queries, - " block_tables.stride(0) = ", block_tables.stride(0), - " blocks = ", blocks, " max_threads = ", threads); + int block_tables_stride = block_tables.stride(0); + TORCH_CHECK((blocks * threads > num_queries), + "multi-step: not enough threads to map to num_queries = ", + num_queries, " block_tables.stride(0) = ", block_tables.stride(0), + " blocks = ", blocks, " max_threads = ", threads); + if (logging) { + printf("launching kernels with %d blocks and %d threads\n", blocks, + threads); } - advance_step_flashinfer_kernel<<>>( threads, num_seqs, num_queries, block_size, reinterpret_cast(input_tokens.data_ptr()), @@ -281,7 +291,7 @@ void advance_step_flashinfer( reinterpret_cast(block_table_bound.data_ptr())); advance_step_flashinfer_indices_kernel<<>>( - threads, num_seqs, num_queries, + num_seqs, num_queries, reinterpret_cast(block_tables.data_ptr()), block_tables.stride(0), reinterpret_cast(paged_kv_indices.data_ptr()), From d909acf9fe17b7db42d7de61903c0058c8b9b344 Mon Sep 17 00:00:00 2001 From: B-201 Date: Wed, 13 Nov 2024 17:25:59 +0800 Subject: [PATCH 12/39] [Model][LoRA]LoRA support added for idefics3 (#10281) Signed-off-by: B-201 --- docs/source/models/supported_models.rst | 2 +- vllm/model_executor/models/idefics3.py | 55 +++++++++++++++++++++---- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 58ec3acc6aea5..161733c049bbe 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -450,7 +450,7 @@ Text Generation - Idefics3 - T + I - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. 
- - + - ✅︎ - * - :code:`InternVLChatModel` - InternVL2 diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 8845b2f58af07..85f23a1da533b 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -33,6 +33,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor @@ -44,7 +45,7 @@ from .idefics2_vision_model import ( Idefics2VisionTransformer as Idefics3VisionTransformer) # yapf: enable -from .interfaces import SupportsMultiModal +from .interfaces import SupportsLoRA, SupportsMultiModal from .llama import LlamaModel from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) @@ -58,8 +59,6 @@ class Idefics3ImagePixelInputs(TypedDict): """ Shape: `(batch_size * num_images, num_channels, height, width)` """ - rows: List[int] - cols: List[int] pixel_attention_mask: Optional[torch.BoolTensor] @@ -356,8 +355,15 @@ def dummy_data_for_idefics3( image_seq_len = processor.image_seq_len max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images + if seq_len - max_llm_image_tokens < 0: + raise RuntimeError( + f"Idefics3 cannot process {num_images} images in a prompt, " + "please increase max_model_len or reduce image limit by " + "--limit-mm-per-prompt.") + seq_data = SequenceData.from_prompt_token_counts( - (hf_config.image_token_id, max_llm_image_tokens), (0, seq_len)) + (hf_config.image_token_id, max_llm_image_tokens), + (0, seq_len - max_llm_image_tokens)) width = height = hf_config.vision_config.image_size image = Image.new("RGB", (width, height), color=0) @@ -463,8 +469,6 @@ def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[ImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_embeds = kwargs.pop("image_embeds", None) - rows = kwargs.pop("rows", None) - cols = kwargs.pop("cols", None) pixel_attention_mask = kwargs.pop("pixel_attention_mask", None) if pixel_values is None and image_embeds is None: @@ -489,8 +493,6 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values( flatten_bn(pixel_values, concat=True)), - rows=rows, - cols=cols, pixel_attention_mask=flatten_bn( pixel_attention_mask, concat=True)) @@ -610,7 +612,33 @@ def forward( @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3) @INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) -class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal): +class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + # LoRA specific attributes + supported_lora_modules = [ + # vision_model + "fc1", + "fc2", + "out_proj", + # text_model + "qkv_proj", # same name with vision encoder + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
super().__init__() @@ -672,3 +700,12 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="model.text_model", + connector="model.connector", + tower_model="model.vision_model") From bb7991aa291054a30f408e626273caa6769a07eb Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 13 Nov 2024 03:02:56 -0800 Subject: [PATCH 13/39] [V1] Add missing tokenizer options for `Detokenizer` (#10288) Signed-off-by: Roger Wang --- vllm/v1/engine/detokenizer.py | 11 +++++++++-- vllm/v1/engine/llm_engine.py | 7 ++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 1dbf8e75ec478..6249d60199a62 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -192,10 +192,17 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: class Detokenizer: - def __init__(self, tokenizer_name: str): + def __init__(self, + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None): # TODO: once we support LoRA, we should should pass the tokenizer # here. We currently have two copies (this + in the LLMEngine). - self.tokenizer = get_tokenizer(tokenizer_name) + self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + revision=revision) # Request id -> IncrementalDetokenizer self.request_states: Dict[str, IncrementalDetokenizer] = {} diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f37db92e8ea6b..5b45615a1b85b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -53,7 +53,12 @@ def __init__( input_registry) # Detokenizer (converts EngineCoreOutputs --> RequestOutput) - self.detokenizer = Detokenizer(vllm_config.model_config.tokenizer) + self.detokenizer = Detokenizer( + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, + ) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( From 78eea7be8f347c6a660008d24b977b72c7034dcb Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 1 Oct 2024 13:25:33 +0000 Subject: [PATCH 14/39] semi_structured for fp16 and bf16 and int8 --- tests/kernels/test_semi_structured.py | 86 +++++++++++++++++++ .../layers/sparsity/__init__.py | 0 .../layers/sparsity/utils/__init__.py | 0 .../sparsity/utils/cusparse_2_4_utils.py | 49 +++++++++++ 4 files changed, 135 insertions(+) create mode 100644 tests/kernels/test_semi_structured.py create mode 100644 vllm/model_executor/layers/sparsity/__init__.py create mode 100644 vllm/model_executor/layers/sparsity/utils/__init__.py create mode 100644 vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py new file mode 100644 index 0000000000000..c5ca5bddd0f08 --- /dev/null +++ b/tests/kernels/test_semi_structured.py @@ -0,0 +1,86 @@ +import pytest +import torch + +from 
vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( + generate_pruned_semi_structured_mat, + semi_structured_sparse_dense_gemm, + semi_structured_dense_sparse_T_gemm, + compress_to_torch_sparse_semi_structured_mat, + decompress_torch_sparse_semi_structured_mat, + get_random_mat, + is_semi_structured_supported +) + +from vllm import _custom_ops as ops + +DTYPES = [torch.float16, torch.bfloat16, torch.int8] +SIZES=[(128, 128), (1024, 8192)] +MNK = [ + (64, 64, 64), + (64, 256, 512), + (512, 512, 512), + (512, 2048, 4096) +] + +def dense_matmul(A, B, dtype): + if dtype is torch.int8: + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + return ops.cutlass_scaled_mm(A, B, scale_a, scale_b, torch.bfloat16).to(torch.int8) + else: + return A @ B + + +@pytest.mark.skipif(not is_semi_structured_supported(), + reason="Semi structured matmul is not supported on this GPU type.") +@pytest.mark.parametrize("size", SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +def test_semi_structured_compress(size, dtype): + input_pruned = generate_pruned_semi_structured_mat(*size, dtype) + output_pruned = decompress_torch_sparse_semi_structured_mat( + compress_to_torch_sparse_semi_structured_mat(input_pruned) + ) + torch.testing.assert_close(input_pruned, output_pruned) + +@pytest.mark.skipif(not is_semi_structured_supported(), + reason="Semi structured matmul is not supported on this GPU type.") +@pytest.mark.parametrize("mnk", MNK) +@pytest.mark.parametrize("dtype", DTYPES) +def test_torch_semi_structured_sparse_dense_matmul(mnk, dtype): + if dtype is torch.int8: + pytest.skip("cusparse does not support sparse x non transposed dense") + M, N, K = mnk + A_pruned = generate_pruned_semi_structured_mat(M, K, dtype) + A = compress_to_torch_sparse_semi_structured_mat(A_pruned) + B = get_random_mat(K, N, dtype) + C_sparse = semi_structured_sparse_dense_gemm(A, B) + C = dense_matmul(A_pruned, B, dtype) + torch.testing.assert_close(C, C_sparse) + +@pytest.mark.skipif(not is_semi_structured_supported(), + reason="Semi structured matmul is not supported on this GPU type.") +@pytest.mark.parametrize("mnk", MNK) +@pytest.mark.parametrize("dtype", DTYPES) +def test_torch_semi_structured_sparse_dense_T_matmul(mnk, dtype): + M, N, K = mnk + A_pruned = generate_pruned_semi_structured_mat(M, K, dtype) + A = compress_to_torch_sparse_semi_structured_mat(A_pruned) + B = get_random_mat(N, K, dtype) + + C_sparse = semi_structured_sparse_dense_gemm(A, B.t()) + C = dense_matmul(A_pruned, B.t(), dtype) + torch.testing.assert_close(C, C_sparse) + +@pytest.mark.skipif(not is_semi_structured_supported(), + reason="Semi structured matmul is not supported on this GPU type.") +@pytest.mark.parametrize("mnk", MNK) +@pytest.mark.parametrize("dtype", DTYPES) +def test_torch_semi_structured_dense_sparse_T_matmul(mnk, dtype): + M, N, K = mnk + B_T_pruned = generate_pruned_semi_structured_mat(N, K, dtype) + B_T = compress_to_torch_sparse_semi_structured_mat(B_T_pruned) + A = get_random_mat(M, K, dtype) + + C_sparse = semi_structured_dense_sparse_T_gemm(A, B_T) + C = dense_matmul(A, B_T_pruned.t(), dtype) + torch.testing.assert_close(C, C_sparse) diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/sparsity/utils/__init__.py b/vllm/model_executor/layers/sparsity/utils/__init__.py new file mode 100644 
index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py new file mode 100644 index 0000000000000..1be6ab4db18b1 --- /dev/null +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -0,0 +1,49 @@ +import torch +from torch.sparse import to_sparse_semi_structured +from vllm.platforms import current_platform +from packaging.version import Version + +def compress_to_torch_sparse_semi_structured_mat(mat): + return to_sparse_semi_structured(mat) + +def decompress_torch_sparse_semi_structured_mat(sp_mat): + # Fix of to_dense() function supporting int8 + # cuSparseLT for int8 requires dense matrix to be non-contiguous + return torch.mm(sp_mat, torch.eye(sp_mat.shape[-1], dtype=sp_mat.dtype, device=sp_mat.device).t()) + +def semi_structured_sparse_dense_gemm( + a_sparse: torch.Tensor, b_dense: torch.Tensor +): + return torch.mm(a_sparse, b_dense) + +def semi_structured_dense_sparse_T_gemm( + a: torch.Tensor, b_T: torch.Tensor +): + return (semi_structured_sparse_dense_gemm(b_T, a.t())).t() + +def is_semi_structured_supported() -> bool: + if not (current_platform.is_cuda() or current_platform.is_rocm()): + return False + + base_torch_version = Version(Version(torch.__version__).base_version) + + capability = current_platform.get_device_capability() + assert capability is not None + capability = capability.to_int() + min_capability = 80 + + return capability == min_capability or (capability > min_capability and base_torch_version >= Version("2.5.0")) + +def get_random_mat(M, K, dtype): + rand_tensor_dtype = dtype + if dtype is torch.int8: + rand_tensor_dtype = torch.float16 + mat = torch.rand(M, K, dtype=rand_tensor_dtype).cuda().to(dtype) + return mat + +def generate_pruned_semi_structured_mat(M, K, dtype): + + mask = torch.Tensor([0, 0, 1, 1]).tile((M, K // 4)).cuda().bool() + mat = get_random_mat(M, K, dtype) + mat = mat.masked_fill_(mat == 0, 1) + return mat * mask From 331e9c5a6ffc354e7e0f9cf66e911bb12f4ce2ba Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 2 Oct 2024 12:26:05 -0400 Subject: [PATCH 15/39] Fix A100 int8 tests --- tests/kernels/test_semi_structured.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index c5ca5bddd0f08..7060a36dd6c8b 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -16,10 +16,10 @@ DTYPES = [torch.float16, torch.bfloat16, torch.int8] SIZES=[(128, 128), (1024, 8192)] MNK = [ - (64, 64, 64), - (64, 256, 512), + (128, 128, 128), + (128, 512, 1024), (512, 512, 512), - (512, 2048, 4096) + (1024, 2048, 4096) ] def dense_matmul(A, B, dtype): From 381a6b4702cf39b2f9e77c98a73f89f71ef53ae8 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 9 Oct 2024 13:56:35 +0000 Subject: [PATCH 16/39] Add fp8 cusparseLt --- CMakeLists.txt | 1 + csrc/ops.h | 9 + .../fp8_semi_structured/cusparseLt.h | 244 ++++++++++++++++++ csrc/torch_bindings.cpp | 11 + tests/kernels/test_semi_structured.py | 71 ++--- vllm/_custom_ops.py | 8 + .../sparsity/utils/cusparse_2_4_utils.py | 80 ++++-- 7 files changed, 371 insertions(+), 53 deletions(-) create mode 100644 csrc/quantization/fp8_semi_structured/cusparseLt.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5acbd762ee957..c0306f0909628 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,6 +198,7 @@ set(VLLM_EXT_SRC "csrc/quantization/fp8/common.cu" 
"csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" + "csrc/quantization/fp8_semi_structured/cusparseLt.h" "csrc/torch_bindings.cpp") if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/csrc/ops.h b/csrc/ops.h index 672e608e9c47e..4f142906e4410 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -222,3 +222,12 @@ void register_graph_buffers(fptr_t _fa, const std::vector>& handles, const std::vector>& offsets); #endif + +#ifndef USE_ROCM +torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input); + +torch::Tensor cslt_mm_fp8_semi_structured( + const torch::Tensor& compressed_A, const torch::Tensor& dense_B, + const std::optional& bias_opt, bool transpose_result); + +#endif diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.h b/csrc/quantization/fp8_semi_structured/cusparseLt.h new file mode 100644 index 0000000000000..867705c117074 --- /dev/null +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.h @@ -0,0 +1,244 @@ +#include +#include + +#include +#include + +namespace vllm { + + +cusparseLtHandle_t handle; +bool handle_initialized = false; +#if not (defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602) + +torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { + + TORCH_CHECK(input.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress") + if (!handle_initialized){ + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); + handle_initialized = true; + } + // create sparse descriptor, dtype + auto compression_factor = 9; + cusparseLtMatDescriptor_t input_descriptor; + cudaDataType type = CUDA_R_8F_E4M3; + auto compressed_tensor = input.new_empty(input.numel() * compression_factor / 16); + + TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( + &handle, + &input_descriptor, + input.size(0), + input.size(1), + input.size(1), + 16, + type, + CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT)); + + size_t compressed_size, compressed_buffer_size; + TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompressedSize2( + &handle, + &input_descriptor, + &compressed_size, + &compressed_buffer_size)); + + auto& allocator = ::c10::cuda::CUDACachingAllocator::get(); + auto compressedBufferPtr = allocator.allocate(compressed_buffer_size); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompress2( + &handle, + &input_descriptor, + true, + CUSPARSE_OPERATION_NON_TRANSPOSE, + input.data_ptr(), + compressed_tensor.data_ptr(), + compressedBufferPtr.get(), + stream)); + return compressed_tensor; +} + +torch::Tensor cslt_mm_fp8_semi_structured( + const torch::Tensor& compressed_A, + const torch::Tensor& dense_B, + const c10::optional& bias_opt, + bool transpose_result +) +{ + TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); + + if (!handle_initialized){ + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); + handle_initialized = true; + } + // cusparseLt data structures + cusparseLtMatmulDescriptor_t matmul; + cusparseLtMatmulPlan_t plan; + cusparseLtMatmulAlgSelection_t alg_sel; + + float alpha = 1.0; + float beta = 0.0; + cudaDataType input_type = CUDA_R_8F_E4M3; + cudaDataType output_type; + cudaDataType C_type; + cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; + auto compression_factor = 9; + ScalarType out_dtype = dense_B.scalar_type(); + + switch (out_dtype) + { + case at::ScalarType::Float8_e4m3fn: + output_type = CUDA_R_8F_E4M3; + C_type = CUDA_R_16F; + break; + case 
at::ScalarType::Half: + output_type = CUDA_R_16F; + C_type = CUDA_R_16F; + break; + case at::ScalarType::BFloat16: + output_type = CUDA_R_16BF; + C_type = CUDA_R_16BF; + break; + case at::ScalarType::Float: + output_type = CUDA_R_32F; + C_type = CUDA_R_32F; + break; + default: + TORCH_CHECK(false, "Unsupported out_dtype passed, must be one of {fp16, bf16, float32} for fp8 inputs"); + break; + } + + int64_t k = dense_B.size(0); + int64_t n = dense_B.size(1); + int64_t m = (compressed_A.numel() * 16 / compression_factor ) / k; + + + //initialize sparse descriptor + cusparseLtMatDescriptor_t sparse_input_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( + &handle, + &sparse_input_descriptor, + m, + k, + k, + 16, + input_type, + CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT)); + + // initialize dense input descriptor + cusparseLtMatDescriptor_t dense_input_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, + &dense_input_descriptor, + (dense_B.is_contiguous()) ? k : n, + (dense_B.is_contiguous()) ? n : k, + (dense_B.is_contiguous()) ? n : k, + 16, + input_type, + CUSPARSE_ORDER_ROW)); + + // create result tensor + auto res_tensor_options = c10::TensorOptions().dtype(out_dtype).device(dense_B.device()); + at::Tensor res = (transpose_result) ? at::empty({n, m}, res_tensor_options) + : at::empty({m, n}, res_tensor_options); + + cusparseLtMatDescriptor_t res_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, + &res_descriptor, + m, + n, + (transpose_result) ? m: n, + 16, + output_type, + (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + + cusparseLtMatDescriptor_t C_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, + &C_descriptor, + m, + n, + (transpose_result) ? m: n, + 16, + C_type, + (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( + &handle, + &matmul, + CUSPARSE_OPERATION_NON_TRANSPOSE, + (dense_B.is_contiguous()) ? 
CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE, + &sparse_input_descriptor, + &dense_input_descriptor, + &C_descriptor, + &res_descriptor, + compute_type)); + + // set bias pointer for matmul, need to assign to get location + if (bias_opt.has_value()) { + auto& bias = bias_opt.value(); + void* dBias = bias.data_ptr(); + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( + &handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, sizeof(dBias))); + } + + cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, + CUSPARSELT_MATMUL_ALG_DEFAULT); + cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel); + size_t workspace_size; + TORCH_CUDASPARSE_CHECK( + cusparseLtMatmulGetWorkspace(&handle, &plan, &workspace_size)); + + + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + auto workspacePtr = allocator.allocate(workspace_size); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( + &handle, + &plan, + &alpha, + compressed_A.data_ptr(), + dense_B.data_ptr(), + &beta, + res.data_ptr(), + res.data_ptr(), + workspacePtr.get(), + // jank because of the way we want this to be an array of streams + &stream, + 1)); + + // Destroy descriptors + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(&sparse_input_descriptor)); + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(&dense_input_descriptor)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&res_descriptor)); + // Destroy plan + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); + return res; +} +#else + +torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { + TORCH_CHECK(false, "Unsupported dtype for compressed matrix in current version of cuSPARSELt."); +} + +at::Tensor cslt_mm_fp8_semi_structured( + const Tensor& compressed_A, + const Tensor& dense_B, + const std::optional& bias_opt, + bool transpose_result, +) +{ +#if not (defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602) + TORCH_CHECK(false, "Unsupported dtype for compressed matrix multiplication in current version of cuSPARSELt."); +#endif +} + +#endif + + +} // namespace vllm \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 229fd554d3eee..6cc5dd5b01a50 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -322,6 +322,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "bool silu_activation," "int pad_slot_id) -> ()"); ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd); + + ops.def("cslt_compress_fp8_semi_structured(Tensor! input) -> Tensor"); + ops.impl("cslt_compress_fp8_semi_structured", torch::kCUDA, + &cslt_compress_fp8_semi_structured); + + ops.def( + "cslt_mm_fp8_semi_structured(Tensor! compressed_A, Tensor! denseB," + "Tensor!? bias, bool transpose_result) -> Tensor"); + + ops.impl("cslt_mm_fp8_semi_structured", torch::kCUDA, + &cslt_mm_fp8_semi_structured); #endif // Quantized GEMM for GPTQ. 
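For reference, a minimal Python sketch of how the bindings registered above can be exercised, using the semi_structured_fp8_compress / semi_structured_fp8_mm wrappers added to vllm/_custom_ops.py later in this patch. This is a hedged usage sketch, not part of the patch itself: it assumes the extension was built with cuSPARSELt >= 0.6.2 (the CUSPARSELT_VERSION >= 602 guard above) on an FP8-capable GPU, and that the sparse operand already follows the 2:4 pattern the kernel requires.

import torch
from vllm._custom_ops import (semi_structured_fp8_compress,
                              semi_structured_fp8_mm)

M, K, N = 32, 64, 32
# 2:4 pruning along K: two of every four consecutive values are zero,
# mirroring generate_pruned_semi_structured_mat in the test utilities.
mask = torch.tensor([0, 0, 1, 1], device="cuda").tile((M, K // 4)).bool()
A = (torch.rand(M, K, device="cuda", dtype=torch.float16) * mask).to(torch.float8_e4m3fn)
B = torch.full((K, N), 0.25, device="cuda", dtype=torch.float8_e4m3fn)

A_compressed = semi_structured_fp8_compress(A)      # -> cslt_compress_fp8_semi_structured
C = semi_structured_fp8_mm(A_compressed, B, bias=None,
                           transpose_result=False)  # -> cslt_mm_fp8_semi_structured
# C has shape (M, N); its dtype follows dense_B, per the kernel above.
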
diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index 7060a36dd6c8b..b9fd2800f1711 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -1,49 +1,46 @@ import pytest import torch +from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( - generate_pruned_semi_structured_mat, - semi_structured_sparse_dense_gemm, - semi_structured_dense_sparse_T_gemm, - compress_to_torch_sparse_semi_structured_mat, + compress_to_torch_sparse_semi_structured_mat, decompress_torch_sparse_semi_structured_mat, - get_random_mat, - is_semi_structured_supported -) + generate_pruned_semi_structured_mat, get_random_mat, + is_semi_structured_supported, semi_structured_dense_sparse_T_gemm, + semi_structured_sparse_dense_gemm, + dense_matmul) -from vllm import _custom_ops as ops +# DTYPES = [torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn] +DTYPES = [torch.float8_e4m3fn] +SIZES = [(128, 128), (1024, 8192)] +MNK = [(128, 128, 128), (128, 512, 1024), (512, 512, 512), (1024, 2048, 4096)] -DTYPES = [torch.float16, torch.bfloat16, torch.int8] -SIZES=[(128, 128), (1024, 8192)] -MNK = [ - (128, 128, 128), - (128, 512, 1024), - (512, 512, 512), - (1024, 2048, 4096) -] -def dense_matmul(A, B, dtype): - if dtype is torch.int8: - scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) - scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - return ops.cutlass_scaled_mm(A, B, scale_a, scale_b, torch.bfloat16).to(torch.int8) - else: - return A @ B - - -@pytest.mark.skipif(not is_semi_structured_supported(), - reason="Semi structured matmul is not supported on this GPU type.") +@pytest.mark.skipif( + not is_semi_structured_supported(), + reason="Semi structured matmul is not supported on this GPU type.") @pytest.mark.parametrize("size", SIZES) @pytest.mark.parametrize("dtype", DTYPES) def test_semi_structured_compress(size, dtype): + if dtype == torch.float8_e4m3fn and not is_quant_method_supported("fp8"): + pytest.skip("fp8 is not supported on this device") input_pruned = generate_pruned_semi_structured_mat(*size, dtype) output_pruned = decompress_torch_sparse_semi_structured_mat( - compress_to_torch_sparse_semi_structured_mat(input_pruned) - ) + compress_to_torch_sparse_semi_structured_mat(input_pruned)) torch.testing.assert_close(input_pruned, output_pruned) -@pytest.mark.skipif(not is_semi_structured_supported(), - reason="Semi structured matmul is not supported on this GPU type.") + +# @pytest.mark.skipif( +# not is_semi_structured_supported() or not is_quant_method_supported("fp8"), +# reason="Semi structured matmul is not supported on this GPU type.") +# @pytest.mark.parametrize("size", SIZES) +# def test_torch_fp8_compress(size): +# x = generate_pruned_semi_structured_mat(*size, torch.float8_e4m3fn) + + +@pytest.mark.skipif( + not is_semi_structured_supported(), + reason="Semi structured matmul is not supported on this GPU type.") @pytest.mark.parametrize("mnk", MNK) @pytest.mark.parametrize("dtype", DTYPES) def test_torch_semi_structured_sparse_dense_matmul(mnk, dtype): @@ -57,8 +54,10 @@ def test_torch_semi_structured_sparse_dense_matmul(mnk, dtype): C = dense_matmul(A_pruned, B, dtype) torch.testing.assert_close(C, C_sparse) -@pytest.mark.skipif(not is_semi_structured_supported(), - reason="Semi structured matmul is not supported on this GPU type.") + +@pytest.mark.skipif( + not is_semi_structured_supported(), + reason="Semi 
structured matmul is not supported on this GPU type.") @pytest.mark.parametrize("mnk", MNK) @pytest.mark.parametrize("dtype", DTYPES) def test_torch_semi_structured_sparse_dense_T_matmul(mnk, dtype): @@ -71,8 +70,10 @@ def test_torch_semi_structured_sparse_dense_T_matmul(mnk, dtype): C = dense_matmul(A_pruned, B.t(), dtype) torch.testing.assert_close(C, C_sparse) -@pytest.mark.skipif(not is_semi_structured_supported(), - reason="Semi structured matmul is not supported on this GPU type.") + +@pytest.mark.skipif( + not is_semi_structured_supported(), + reason="Semi structured matmul is not supported on this GPU type.") @pytest.mark.parametrize("mnk", MNK) @pytest.mark.parametrize("dtype", DTYPES) def test_torch_semi_structured_dense_sparse_T_matmul(mnk, dtype): diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 8f331a27a20de..d06f9bf8ea33f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -706,6 +706,14 @@ def scaled_fp8_quant( return output, scale +# semi structured fp8 +def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: + assert input.dtype == torch.float8_e4m3fn + return torch.ops._C.cslt_compress_fp8_semi_structured(input) + +def semi_structured_fp8_mm(A_compressed: torch.Tensor, B_dense: torch.Tensor, bias: Optional[torch.Tensor], transpose_result: bool = False) -> torch.Tensor: + assert A_compressed.dtype == torch.float8_e4m3fn + return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, bias, transpose_result) # int8 def scaled_int8_quant( diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 1be6ab4db18b1..949d39cce72ec 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -1,49 +1,93 @@ import torch +from packaging.version import Version from torch.sparse import to_sparse_semi_structured + +from vllm._custom_ops import (semi_structured_fp8_compress, + semi_structured_fp8_mm) from vllm.platforms import current_platform -from packaging.version import Version +from vllm import _custom_ops as ops + def compress_to_torch_sparse_semi_structured_mat(mat): - return to_sparse_semi_structured(mat) + if mat.dtype == torch.float8_e4m3fn: + return semi_structured_fp8_compress(mat) + else: + return to_sparse_semi_structured(mat) + def decompress_torch_sparse_semi_structured_mat(sp_mat): # Fix of to_dense() function supporting int8 # cuSparseLT for int8 requires dense matrix to be non-contiguous - return torch.mm(sp_mat, torch.eye(sp_mat.shape[-1], dtype=sp_mat.dtype, device=sp_mat.device).t()) + if sp_mat.dtype == torch.float8_e4m3fn: + return semi_structured_fp8_mm(sp_mat, + torch.eye(sp_mat.shape[-1], + dtype=sp_mat.dtype, + device=sp_mat.device), + transpose_result=False) + else: + return torch.mm( + sp_mat, + torch.eye(sp_mat.shape[-1], + dtype=sp_mat.dtype, + device=sp_mat.device).t()) + -def semi_structured_sparse_dense_gemm( - a_sparse: torch.Tensor, b_dense: torch.Tensor -): - return torch.mm(a_sparse, b_dense) +def semi_structured_sparse_dense_gemm(a_sparse: torch.Tensor, + b_dense: torch.Tensor): + assert a_sparse.dtype in [ + torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn + ], f"Semi structured sparse-dense matmul does not support {a_sparse.dtype}" + if a_sparse.dtype == torch.float8_e4m3fn: + semi_structured_fp8_mm(a_sparse, b_dense, transpose_result=False) + else: + return torch.mm(a_sparse, b_dense) -def 
semi_structured_dense_sparse_T_gemm( - a: torch.Tensor, b_T: torch.Tensor -): + +def semi_structured_dense_sparse_T_gemm(a: torch.Tensor, b_T: torch.Tensor): return (semi_structured_sparse_dense_gemm(b_T, a.t())).t() +# test utils +def dense_matmul(A, B, dtype): + if dtype in [torch.int8, torch.float8_e4m3fn]: + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + return ops.cutlass_scaled_mm(A, B, scale_a, scale_b, + torch.bfloat16).to(dtype) + else: + return A @ B + + def is_semi_structured_supported() -> bool: if not (current_platform.is_cuda() or current_platform.is_rocm()): return False base_torch_version = Version(Version(torch.__version__).base_version) - + capability = current_platform.get_device_capability() assert capability is not None capability = capability.to_int() min_capability = 80 - return capability == min_capability or (capability > min_capability and base_torch_version >= Version("2.5.0")) + return capability == min_capability or ( + capability > min_capability and base_torch_version >= Version("2.5.0")) + def get_random_mat(M, K, dtype): rand_tensor_dtype = dtype - if dtype is torch.int8: + if dtype in [torch.int8, torch.float8_e4m3fn]: rand_tensor_dtype = torch.float16 - mat = torch.rand(M, K, dtype=rand_tensor_dtype).cuda().to(dtype) - return mat + mat = torch.rand(M, K, dtype=rand_tensor_dtype).cuda() + mat = mat.masked_fill_(mat == 0, 1) + return mat.to(dtype) -def generate_pruned_semi_structured_mat(M, K, dtype): +def generate_pruned_semi_structured_mat(M, K, dtype): mask = torch.Tensor([0, 0, 1, 1]).tile((M, K // 4)).cuda().bool() - mat = get_random_mat(M, K, dtype) + rand_tensor_dtype = dtype + if dtype in [torch.int8, torch.float8_e4m3fn]: + rand_tensor_dtype = torch.float16 + mat = torch.rand(M, K, dtype=rand_tensor_dtype).cuda() mat = mat.masked_fill_(mat == 0, 1) - return mat * mask + mat = mat * mask + # mat = get_random_mat(M, K, dtype) + return mat.to(dtype) From b146a7966723cd60a23476324914a92e5b985cae Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 9 Oct 2024 14:24:38 +0000 Subject: [PATCH 17/39] wip --- csrc/quantization/fp8_semi_structured/cusparseLt.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.h b/csrc/quantization/fp8_semi_structured/cusparseLt.h index 867705c117074..6bd8aef2e943f 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.h +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.h @@ -4,12 +4,11 @@ #include #include -namespace vllm { - +// namespace vllm { cusparseLtHandle_t handle; bool handle_initialized = false; -#if not (defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602) +#if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { @@ -241,4 +240,4 @@ at::Tensor cslt_mm_fp8_semi_structured( #endif -} // namespace vllm \ No newline at end of file +// } // namespace vllm \ No newline at end of file From 0ac01cc3e40958f0056a167634fced92cc647663 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 9 Oct 2024 14:53:00 +0000 Subject: [PATCH 18/39] Fix signatures --- csrc/ops.h | 2 +- csrc/quantization/fp8_semi_structured/cusparseLt.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 4f142906e4410..781a3518939ad 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -228,6 +228,6 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& 
input); torch::Tensor cslt_mm_fp8_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, - const std::optional& bias_opt, bool transpose_result); + const c10::optional& bias_opt, bool transpose_result); #endif diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.h b/csrc/quantization/fp8_semi_structured/cusparseLt.h index 6bd8aef2e943f..25570a019e913 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.h +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.h @@ -225,10 +225,10 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { TORCH_CHECK(false, "Unsupported dtype for compressed matrix in current version of cuSPARSELt."); } -at::Tensor cslt_mm_fp8_semi_structured( +torch::Tensor cslt_mm_fp8_semi_structured( const Tensor& compressed_A, const Tensor& dense_B, - const std::optional& bias_opt, + const c10::optional& bias_opt, bool transpose_result, ) { From 7472af27de6e97b94514208ced77534caaa50188 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Sun, 13 Oct 2024 20:55:04 +0000 Subject: [PATCH 19/39] Fix compilation and tests --- CMakeLists.txt | 2 +- .../{cusparseLt.h => cusparseLt.cpp} | 39 ++++++------ tests/kernels/test_semi_structured.py | 60 +++++++++++++++---- vllm/_custom_ops.py | 11 +++- .../sparsity/utils/cusparse_2_4_utils.py | 40 +++++++++---- 5 files changed, 107 insertions(+), 45 deletions(-) rename csrc/quantization/fp8_semi_structured/{cusparseLt.h => cusparseLt.cpp} (88%) diff --git a/CMakeLists.txt b/CMakeLists.txt index c0306f0909628..6be55e5e38f1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,7 +198,7 @@ set(VLLM_EXT_SRC "csrc/quantization/fp8/common.cu" "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" - "csrc/quantization/fp8_semi_structured/cusparseLt.h" + "csrc/quantization/fp8_semi_structured/cusparseLt.cpp" "csrc/torch_bindings.cpp") if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.h b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp similarity index 88% rename from csrc/quantization/fp8_semi_structured/cusparseLt.h rename to csrc/quantization/fp8_semi_structured/cusparseLt.cpp index 25570a019e913..5437dbb3ae2e4 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.h +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -1,14 +1,25 @@ #include #include +#include +#include #include -#include -// namespace vllm { +#if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 600 + +#define CUDASPARSE_CHECK(EXPR) \ + do { \ + cusparseStatus_t __err = EXPR; \ + TORCH_CHECK(__err == CUSPARSE_STATUS_SUCCESS, \ + "CUDA error: ", \ + cusparseGetErrorString(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + cusparseLtHandle_t handle; bool handle_initialized = false; -#if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 + torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { @@ -41,7 +52,7 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { &compressed_size, &compressed_buffer_size)); - auto& allocator = ::c10::cuda::CUDACachingAllocator::get(); + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); auto compressedBufferPtr = allocator.allocate(compressed_buffer_size); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -82,7 +93,7 @@ torch::Tensor cslt_mm_fp8_semi_structured( cudaDataType C_type; cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; auto compression_factor = 9; - ScalarType out_dtype = dense_B.scalar_type(); + auto 
out_dtype = dense_B.scalar_type(); switch (out_dtype) { @@ -191,7 +202,7 @@ torch::Tensor cslt_mm_fp8_semi_structured( cusparseLtMatmulGetWorkspace(&handle, &plan, &workspace_size)); - auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); auto workspacePtr = allocator.allocate(workspace_size); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -205,7 +216,6 @@ torch::Tensor cslt_mm_fp8_semi_structured( res.data_ptr(), res.data_ptr(), workspacePtr.get(), - // jank because of the way we want this to be an array of streams &stream, 1)); @@ -219,6 +229,7 @@ torch::Tensor cslt_mm_fp8_semi_structured( TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); return res; } + #else torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { @@ -226,18 +237,12 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { } torch::Tensor cslt_mm_fp8_semi_structured( - const Tensor& compressed_A, - const Tensor& dense_B, - const c10::optional& bias_opt, + const torch::Tensor& compressed_A, + const torch::Tensor& dense_B, + const c10::optional& bias_opt, bool transpose_result, -) -{ -#if not (defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602) +) { TORCH_CHECK(false, "Unsupported dtype for compressed matrix multiplication in current version of cuSPARSELt."); -#endif } #endif - - -// } // namespace vllm \ No newline at end of file diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index b9fd2800f1711..216cb4a547d3c 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -4,15 +4,14 @@ from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( compress_to_torch_sparse_semi_structured_mat, - decompress_torch_sparse_semi_structured_mat, + decompress_torch_sparse_semi_structured_mat, dense_matmul, generate_pruned_semi_structured_mat, get_random_mat, is_semi_structured_supported, semi_structured_dense_sparse_T_gemm, - semi_structured_sparse_dense_gemm, - dense_matmul) + semi_structured_sparse_dense_gemm) -# DTYPES = [torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn] -DTYPES = [torch.float8_e4m3fn] +DTYPES = [torch.float16, torch.bfloat16, torch.int8] SIZES = [(128, 128), (1024, 8192)] +SIZES_FP8 = [(32, 64), (1024, 1024)] MNK = [(128, 128, 128), (128, 512, 1024), (512, 512, 512), (1024, 2048, 4096)] @@ -22,20 +21,25 @@ @pytest.mark.parametrize("size", SIZES) @pytest.mark.parametrize("dtype", DTYPES) def test_semi_structured_compress(size, dtype): - if dtype == torch.float8_e4m3fn and not is_quant_method_supported("fp8"): - pytest.skip("fp8 is not supported on this device") input_pruned = generate_pruned_semi_structured_mat(*size, dtype) output_pruned = decompress_torch_sparse_semi_structured_mat( compress_to_torch_sparse_semi_structured_mat(input_pruned)) torch.testing.assert_close(input_pruned, output_pruned) -# @pytest.mark.skipif( -# not is_semi_structured_supported() or not is_quant_method_supported("fp8"), -# reason="Semi structured matmul is not supported on this GPU type.") -# @pytest.mark.parametrize("size", SIZES) -# def test_torch_fp8_compress(size): -# x = generate_pruned_semi_structured_mat(*size, torch.float8_e4m3fn) +@pytest.mark.skipif( + not is_semi_structured_supported() or not is_quant_method_supported("fp8"), + reason="Semi structured fp8 matmul is not supported on this GPU type.") 
+@pytest.mark.parametrize("size", SIZES_FP8) +def test_semi_structured_fp8_compress(size): + dtype = torch.float8_e4m3fn + input_pruned = generate_pruned_semi_structured_mat(*size, dtype) + output_pruned = decompress_torch_sparse_semi_structured_mat( + compress_to_torch_sparse_semi_structured_mat(input_pruned)) + torch.testing.assert_close(input_pruned.to(torch.float32), + output_pruned.to(torch.float32), + rtol=1e-1, + atol=1e-1) @pytest.mark.skipif( @@ -71,6 +75,21 @@ def test_torch_semi_structured_sparse_dense_T_matmul(mnk, dtype): torch.testing.assert_close(C, C_sparse) +@pytest.mark.skipif( + not is_semi_structured_supported() or not is_quant_method_supported("fp8"), + reason="Semi structured fp8 matmul is not supported on this GPU type.") +def test_torch_semi_structured_sparse_dense_T_fp8_matmul(): + M, N, K = (32, 64, 32) + dtype = torch.float8_e4m3fn + A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=dtype) + A = compress_to_torch_sparse_semi_structured_mat(A_pruned) + B = torch.full((K, N), .25, device='cuda', dtype=dtype).t() + + C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) + C_sparse = semi_structured_sparse_dense_gemm(A, B).to(torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) + + @pytest.mark.skipif( not is_semi_structured_supported(), reason="Semi structured matmul is not supported on this GPU type.") @@ -85,3 +104,18 @@ def test_torch_semi_structured_dense_sparse_T_matmul(mnk, dtype): C_sparse = semi_structured_dense_sparse_T_gemm(A, B_T) C = dense_matmul(A, B_T_pruned.t(), dtype) torch.testing.assert_close(C, C_sparse) + + +@pytest.mark.skipif( + not is_semi_structured_supported() or not is_quant_method_supported("fp8"), + reason="Semi structured fp8 matmul is not supported on this GPU type.") +def test_torch_semi_structured_dense_sparse_T_fp8_matmul(): + M, N, K = (32, 64, 32) + dtype = torch.float8_e4m3fn + B_T_pruned = generate_pruned_semi_structured_mat(N, K, dtype=dtype) + B_T = compress_to_torch_sparse_semi_structured_mat(B_T_pruned) + A = torch.full((M, K), .25, device='cuda', dtype=dtype) + + C_sparse = semi_structured_dense_sparse_T_gemm(A, B_T).to(torch.float32) + C = dense_matmul(A, B_T_pruned.t(), dtype=dtype).to(torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index d06f9bf8ea33f..b098330f1dfa8 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -706,14 +706,21 @@ def scaled_fp8_quant( return output, scale + # semi structured fp8 def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: assert input.dtype == torch.float8_e4m3fn return torch.ops._C.cslt_compress_fp8_semi_structured(input) -def semi_structured_fp8_mm(A_compressed: torch.Tensor, B_dense: torch.Tensor, bias: Optional[torch.Tensor], transpose_result: bool = False) -> torch.Tensor: + +def semi_structured_fp8_mm(A_compressed: torch.Tensor, + B_dense: torch.Tensor, + bias: Optional[torch.Tensor] = None, + transpose_result: bool = False) -> torch.Tensor: assert A_compressed.dtype == torch.float8_e4m3fn - return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, bias, transpose_result) + return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, + bias, transpose_result) + # int8 def scaled_int8_quant( diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 949d39cce72ec..201814b4f0401 100644 --- 
a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -1,30 +1,44 @@ import torch from packaging.version import Version -from torch.sparse import to_sparse_semi_structured +from torch.sparse import (SparseSemiStructuredTensor, + SparseSemiStructuredTensorCUSPARSELT, + to_sparse_semi_structured) +from vllm import _custom_ops as ops from vllm._custom_ops import (semi_structured_fp8_compress, semi_structured_fp8_mm) from vllm.platforms import current_platform -from vllm import _custom_ops as ops -def compress_to_torch_sparse_semi_structured_mat(mat): - if mat.dtype == torch.float8_e4m3fn: - return semi_structured_fp8_compress(mat) +def compress_to_torch_sparse_semi_structured_mat(original_tensor): + if original_tensor.dtype == torch.float8_e4m3fn: + packed = semi_structured_fp8_compress(original_tensor) + return SparseSemiStructuredTensorCUSPARSELT( + shape=original_tensor.shape, + packed=packed, + meta=None, + packed_t=None, + meta_t=None, + compressed_swizzled_bitmask=None, + fuse_transpose_cusparselt=SparseSemiStructuredTensor. + _FUSE_TRANSPOSE, + alg_id_cusparselt=SparseSemiStructuredTensor._DEFAULT_ALG_ID, + requires_grad=original_tensor.requires_grad, + ) else: - return to_sparse_semi_structured(mat) + return to_sparse_semi_structured(original_tensor) def decompress_torch_sparse_semi_structured_mat(sp_mat): - # Fix of to_dense() function supporting int8 - # cuSparseLT for int8 requires dense matrix to be non-contiguous if sp_mat.dtype == torch.float8_e4m3fn: - return semi_structured_fp8_mm(sp_mat, + return semi_structured_fp8_mm(sp_mat.packed, torch.eye(sp_mat.shape[-1], dtype=sp_mat.dtype, - device=sp_mat.device), + device=sp_mat.device).t(), transpose_result=False) else: + # Fix of to_dense() function supporting int8 + # cuSparseLT for int8 requires dense matrix to be non-contiguous return torch.mm( sp_mat, torch.eye(sp_mat.shape[-1], @@ -38,7 +52,9 @@ def semi_structured_sparse_dense_gemm(a_sparse: torch.Tensor, torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn ], f"Semi structured sparse-dense matmul does not support {a_sparse.dtype}" if a_sparse.dtype == torch.float8_e4m3fn: - semi_structured_fp8_mm(a_sparse, b_dense, transpose_result=False) + return semi_structured_fp8_mm(a_sparse.packed, + b_dense, + transpose_result=False) else: return torch.mm(a_sparse, b_dense) @@ -46,6 +62,7 @@ def semi_structured_sparse_dense_gemm(a_sparse: torch.Tensor, def semi_structured_dense_sparse_T_gemm(a: torch.Tensor, b_T: torch.Tensor): return (semi_structured_sparse_dense_gemm(b_T, a.t())).t() + # test utils def dense_matmul(A, B, dtype): if dtype in [torch.int8, torch.float8_e4m3fn]: @@ -89,5 +106,4 @@ def generate_pruned_semi_structured_mat(M, K, dtype): mat = torch.rand(M, K, dtype=rand_tensor_dtype).cuda() mat = mat.masked_fill_(mat == 0, 1) mat = mat * mask - # mat = get_random_mat(M, K, dtype) return mat.to(dtype) From 3fe8bd4567f7669c2c453d0f5a071111893625ba Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 15 Oct 2024 09:31:17 -0400 Subject: [PATCH 20/39] Update for older platforms --- CMakeLists.txt | 6 + .../fp8_semi_structured/cusparseLt.cpp | 410 ++++++++---------- tests/kernels/test_semi_structured.py | 9 +- .../model_executor/layers/quantization/fp8.py | 66 +++ 4 files changed, 262 insertions(+), 229 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6be55e5e38f1b..e2baa980bca0f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -401,6 +401,12 @@ 
define_gpu_extension_target( # Setting this variable sidesteps the issue by calling the driver directly. target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) +# If cuSparseLt is not installed we skip 2:4 optimizations +CHECK_INCLUDE_FILE_CXX("cusparseLt.h" HAVE_CUSPARSELT) +message(STATUS "Result of include cusparseLt ${HAVE_CUSPARSELT}") +if(HAVE_CUSPARSELT) + target_compile_definitions(_C PRIVATE VLLM_CUSPARSELT_ENABLED=1) +endif() # # _moe_C extension # diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index 5437dbb3ae2e4..c7e0e9de703c3 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -3,246 +3,204 @@ #include #include -#include +#define STUB_FUNC_IMPL() \ +torch::Tensor cslt_compress_fp8_semi_structured( \ + const torch::Tensor& input) { \ + TORCH_CHECK(false, \ + "Unsupported dtype for compressed matrix in current " \ + "version of cuSPARSELt."); \ +} \ + \ +torch::Tensor cslt_mm_fp8_semi_structured( \ + const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ + const c10::optional& bias_opt, bool transpose_result) { \ + TORCH_CHECK(false, \ + "Unsupported dtype for compressed matrix multiplication in " \ + "current version of cuSPARSELt."); \ +} + -#if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 600 +#if defined(VLLM_CUSPARSELT_ENABLED) -#define CUDASPARSE_CHECK(EXPR) \ - do { \ - cusparseStatus_t __err = EXPR; \ - TORCH_CHECK(__err == CUSPARSE_STATUS_SUCCESS, \ - "CUDA error: ", \ - cusparseGetErrorString(__err), \ - " when calling `" #EXPR "`"); \ - } while (0) + #include + #if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 600 + + #define CUDASPARSE_CHECK(EXPR) \ + do { \ + cusparseStatus_t __err = EXPR; \ + TORCH_CHECK(__err == CUSPARSE_STATUS_SUCCESS, \ + "CUDA error: ", cusparseGetErrorString(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) cusparseLtHandle_t handle; bool handle_initialized = false; - torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { - - TORCH_CHECK(input.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress") - if (!handle_initialized){ - TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); - handle_initialized = true; - } - // create sparse descriptor, dtype - auto compression_factor = 9; - cusparseLtMatDescriptor_t input_descriptor; - cudaDataType type = CUDA_R_8F_E4M3; - auto compressed_tensor = input.new_empty(input.numel() * compression_factor / 16); - - TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &handle, - &input_descriptor, - input.size(0), - input.size(1), - input.size(1), - 16, - type, - CUSPARSE_ORDER_ROW, - CUSPARSELT_SPARSITY_50_PERCENT)); - - size_t compressed_size, compressed_buffer_size; - TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompressedSize2( - &handle, - &input_descriptor, - &compressed_size, - &compressed_buffer_size)); - - auto& allocator = *c10::cuda::CUDACachingAllocator::get(); - auto compressedBufferPtr = allocator.allocate(compressed_buffer_size); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompress2( - &handle, - &input_descriptor, - true, - CUSPARSE_OPERATION_NON_TRANSPOSE, - input.data_ptr(), - compressed_tensor.data_ptr(), - compressedBufferPtr.get(), - stream)); - return compressed_tensor; + TORCH_CHECK(input.scalar_type() == at::ScalarType::Float8_e4m3fn, + "Only float8 e4m3 is 
supported in vllm:cslt_compress") + if (!handle_initialized) { + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); + handle_initialized = true; + } + // create sparse descriptor, dtype + auto compression_factor = 9; + cusparseLtMatDescriptor_t input_descriptor; + cudaDataType type = CUDA_R_8F_E4M3; + auto compressed_tensor = + input.new_empty(input.numel() * compression_factor / 16); + + TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( + &handle, &input_descriptor, input.size(0), input.size(1), input.size(1), + 16, type, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); + + size_t compressed_size, compressed_buffer_size; + TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompressedSize2( + &handle, &input_descriptor, &compressed_size, &compressed_buffer_size)); + + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + auto compressedBufferPtr = allocator.allocate(compressed_buffer_size); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompress2( + &handle, &input_descriptor, true, CUSPARSE_OPERATION_NON_TRANSPOSE, + input.data_ptr(), compressed_tensor.data_ptr(), compressedBufferPtr.get(), + stream)); + return compressed_tensor; } torch::Tensor cslt_mm_fp8_semi_structured( - const torch::Tensor& compressed_A, - const torch::Tensor& dense_B, - const c10::optional& bias_opt, - bool transpose_result -) -{ - TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); - - if (!handle_initialized){ - TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); - handle_initialized = true; - } - // cusparseLt data structures - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulPlan_t plan; - cusparseLtMatmulAlgSelection_t alg_sel; - - float alpha = 1.0; - float beta = 0.0; - cudaDataType input_type = CUDA_R_8F_E4M3; - cudaDataType output_type; - cudaDataType C_type; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; - auto compression_factor = 9; - auto out_dtype = dense_B.scalar_type(); - - switch (out_dtype) - { - case at::ScalarType::Float8_e4m3fn: - output_type = CUDA_R_8F_E4M3; - C_type = CUDA_R_16F; - break; - case at::ScalarType::Half: - output_type = CUDA_R_16F; - C_type = CUDA_R_16F; - break; - case at::ScalarType::BFloat16: - output_type = CUDA_R_16BF; - C_type = CUDA_R_16BF; - break; - case at::ScalarType::Float: - output_type = CUDA_R_32F; - C_type = CUDA_R_32F; - break; - default: - TORCH_CHECK(false, "Unsupported out_dtype passed, must be one of {fp16, bf16, float32} for fp8 inputs"); - break; - } - - int64_t k = dense_B.size(0); - int64_t n = dense_B.size(1); - int64_t m = (compressed_A.numel() * 16 / compression_factor ) / k; - - - //initialize sparse descriptor - cusparseLtMatDescriptor_t sparse_input_descriptor; - TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &handle, - &sparse_input_descriptor, - m, - k, - k, - 16, - input_type, - CUSPARSE_ORDER_ROW, - CUSPARSELT_SPARSITY_50_PERCENT)); - - // initialize dense input descriptor - cusparseLtMatDescriptor_t dense_input_descriptor; - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &handle, - &dense_input_descriptor, - (dense_B.is_contiguous()) ? k : n, - (dense_B.is_contiguous()) ? n : k, - (dense_B.is_contiguous()) ? n : k, - 16, - input_type, - CUSPARSE_ORDER_ROW)); - - // create result tensor - auto res_tensor_options = c10::TensorOptions().dtype(out_dtype).device(dense_B.device()); - at::Tensor res = (transpose_result) ? 
at::empty({n, m}, res_tensor_options) - : at::empty({m, n}, res_tensor_options); - - cusparseLtMatDescriptor_t res_descriptor; - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &handle, - &res_descriptor, - m, - n, - (transpose_result) ? m: n, - 16, - output_type, - (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); - - cusparseLtMatDescriptor_t C_descriptor; - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &handle, - &C_descriptor, - m, - n, - (transpose_result) ? m: n, - 16, - C_type, - (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); - - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( - &handle, - &matmul, - CUSPARSE_OPERATION_NON_TRANSPOSE, - (dense_B.is_contiguous()) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE, - &sparse_input_descriptor, - &dense_input_descriptor, - &C_descriptor, - &res_descriptor, - compute_type)); - - // set bias pointer for matmul, need to assign to get location - if (bias_opt.has_value()) { - auto& bias = bias_opt.value(); - void* dBias = bias.data_ptr(); - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( - &handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, sizeof(dBias))); - } - - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, - CUSPARSELT_MATMUL_ALG_DEFAULT); - cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel); - size_t workspace_size; - TORCH_CUDASPARSE_CHECK( - cusparseLtMatmulGetWorkspace(&handle, &plan, &workspace_size)); - - - auto& allocator = *c10::cuda::CUDACachingAllocator::get(); - auto workspacePtr = allocator.allocate(workspace_size); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( - &handle, - &plan, - &alpha, - compressed_A.data_ptr(), - dense_B.data_ptr(), - &beta, - res.data_ptr(), - res.data_ptr(), - workspacePtr.get(), - &stream, - 1)); - - // Destroy descriptors - TORCH_CUDASPARSE_CHECK( - cusparseLtMatDescriptorDestroy(&sparse_input_descriptor)); - TORCH_CUDASPARSE_CHECK( - cusparseLtMatDescriptorDestroy(&dense_input_descriptor)); - TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&res_descriptor)); - // Destroy plan - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); - return res; + const torch::Tensor& compressed_A, const torch::Tensor& dense_B, + const c10::optional& bias_opt, bool transpose_result) { + TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, + "Only float8 e4m3 is supported in vllm:cslt_compress"); + + if (!handle_initialized) { + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); + handle_initialized = true; + } + // cusparseLt data structures + cusparseLtMatmulDescriptor_t matmul; + cusparseLtMatmulPlan_t plan; + cusparseLtMatmulAlgSelection_t alg_sel; + + float alpha = 1.0; + float beta = 0.0; + cudaDataType input_type = CUDA_R_8F_E4M3; + cudaDataType output_type; + cudaDataType C_type; + cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; + auto compression_factor = 9; + auto out_dtype = dense_B.scalar_type(); + + switch (out_dtype) { + case at::ScalarType::Float8_e4m3fn: + output_type = CUDA_R_8F_E4M3; + C_type = CUDA_R_16F; + break; + case at::ScalarType::Half: + output_type = CUDA_R_16F; + C_type = CUDA_R_16F; + break; + case at::ScalarType::BFloat16: + output_type = CUDA_R_16BF; + C_type = CUDA_R_16BF; + break; + case at::ScalarType::Float: + output_type = CUDA_R_32F; + C_type = CUDA_R_32F; + break; + default: + TORCH_CHECK(false, + "Unsupported out_dtype passed, must be one of {fp16, bf16, " + "float32} for fp8 
inputs"); + break; + } + + int64_t k = dense_B.size(0); + int64_t n = dense_B.size(1); + int64_t m = (compressed_A.numel() * 16 / compression_factor) / k; + + // initialize sparse descriptor + cusparseLtMatDescriptor_t sparse_input_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( + &handle, &sparse_input_descriptor, m, k, k, 16, input_type, + CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); + + // initialize dense input descriptor + cusparseLtMatDescriptor_t dense_input_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, &dense_input_descriptor, (dense_B.is_contiguous()) ? k : n, + (dense_B.is_contiguous()) ? n : k, (dense_B.is_contiguous()) ? n : k, 16, + input_type, CUSPARSE_ORDER_ROW)); + + // create result tensor + auto res_tensor_options = + c10::TensorOptions().dtype(out_dtype).device(dense_B.device()); + at::Tensor res = (transpose_result) ? at::empty({n, m}, res_tensor_options) + : at::empty({m, n}, res_tensor_options); + + cusparseLtMatDescriptor_t res_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, &res_descriptor, m, n, (transpose_result) ? m : n, 16, + output_type, + (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + + cusparseLtMatDescriptor_t C_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, &C_descriptor, m, n, (transpose_result) ? m : n, 16, C_type, + (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( + &handle, &matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, + (dense_B.is_contiguous()) ? CUSPARSE_OPERATION_NON_TRANSPOSE + : CUSPARSE_OPERATION_TRANSPOSE, + &sparse_input_descriptor, &dense_input_descriptor, &C_descriptor, + &res_descriptor, compute_type)); + + // set bias pointer for matmul, need to assign to get location + if (bias_opt.has_value()) { + auto& bias = bias_opt.value(); + void* dBias = bias.data_ptr(); + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( + &handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, + sizeof(dBias))); + } + + cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, + CUSPARSELT_MATMUL_ALG_DEFAULT); + cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel); + size_t workspace_size; + TORCH_CUDASPARSE_CHECK( + cusparseLtMatmulGetWorkspace(&handle, &plan, &workspace_size)); + + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + auto workspacePtr = allocator.allocate(workspace_size); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( + &handle, &plan, &alpha, compressed_A.data_ptr(), dense_B.data_ptr(), + &beta, res.data_ptr(), res.data_ptr(), workspacePtr.get(), &stream, 1)); + + // Destroy descriptors + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(&sparse_input_descriptor)); + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(&dense_input_descriptor)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&res_descriptor)); + // Destroy plan + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); + return res; } - #else -torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { - TORCH_CHECK(false, "Unsupported dtype for compressed matrix in current version of cuSPARSELt."); -} +STUB_FUNC_IMPL() -torch::Tensor cslt_mm_fp8_semi_structured( - const torch::Tensor& compressed_A, - const torch::Tensor& dense_B, - const c10::optional& bias_opt, - bool transpose_result, -) { - TORCH_CHECK(false, "Unsupported dtype for 
compressed matrix multiplication in current version of cuSPARSELt."); -} +#endif + +#else + +STUB_FUNC_IMPL() #endif diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index 216cb4a547d3c..f14e959a8ad29 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -27,8 +27,9 @@ def test_semi_structured_compress(size, dtype): torch.testing.assert_close(input_pruned, output_pruned) +# TODO modelopt config has to be replaced with corresponding fp8_24 config @pytest.mark.skipif( - not is_semi_structured_supported() or not is_quant_method_supported("fp8"), + not is_semi_structured_supported() or not is_quant_method_supported("modelopt"), reason="Semi structured fp8 matmul is not supported on this GPU type.") @pytest.mark.parametrize("size", SIZES_FP8) def test_semi_structured_fp8_compress(size): @@ -75,8 +76,9 @@ def test_torch_semi_structured_sparse_dense_T_matmul(mnk, dtype): torch.testing.assert_close(C, C_sparse) +# TODO modelopt config has to be replaced with corresponding fp8_24 config @pytest.mark.skipif( - not is_semi_structured_supported() or not is_quant_method_supported("fp8"), + not is_semi_structured_supported() or not is_quant_method_supported("modelopt"), reason="Semi structured fp8 matmul is not supported on this GPU type.") def test_torch_semi_structured_sparse_dense_T_fp8_matmul(): M, N, K = (32, 64, 32) @@ -106,8 +108,9 @@ def test_torch_semi_structured_dense_sparse_T_matmul(mnk, dtype): torch.testing.assert_close(C, C_sparse) +# TODO modelopt config has to be replaced with corresponding fp8_24 config @pytest.mark.skipif( - not is_semi_structured_supported() or not is_quant_method_supported("fp8"), + not is_semi_structured_supported() or not is_quant_method_supported("modelopt"), reason="Semi structured fp8 matmul is not supported on this GPU type.") def test_torch_semi_structured_dense_sparse_T_fp8_matmul(): M, N, K = (32, 64, 32) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 978e727bc7cb3..d01d0be9306af 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -92,6 +92,72 @@ def get_quant_method(self, layer: torch.nn.Module, return Fp8KVCacheMethod(self) return None + def get_scaled_act_names(self) -> List[str]: + return [] + +class Fp8Config(QuantizationConfig): + """Config class for FP8.""" + + def __init__( + self, + is_checkpoint_fp8_serialized: bool = False, + activation_scheme: str = "dynamic", + ignored_layers: Optional[List[str]] = None, + ) -> None: + self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + if is_checkpoint_fp8_serialized: + logger.warning("Detected fp8 checkpoint. 
Please note that the " + "format is experimental and subject to change.") + if activation_scheme not in ACTIVATION_SCHEMES: + raise ValueError( + f"Unsupported activation scheme {activation_scheme}") + self.activation_scheme = activation_scheme + self.ignored_layers = ignored_layers or [] + + @classmethod + def get_name(cls) -> str: + return "fp8" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": + quant_method = cls.get_from_keys(config, ["quant_method"]) + is_checkpoint_fp8_serialized = ("fp8" in quant_method) + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) + return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, + activation_scheme=activation_scheme, + ignored_layers=ignored_layers) + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + if isinstance(layer, LinearBase): + if is_layer_skipped(prefix, self.ignored_layers): + return UnquantizedLinearMethod() + return Fp8LinearMethod(self) + elif isinstance(layer, FusedMoE): + return Fp8MoEMethod(self) + elif isinstance(layer, Attention): + return Fp8KVCacheMethod(self) + return None + + def get_scaled_act_names(self) -> List[str]: + return [] + + class Fp8LinearMethod(LinearMethodBase): """Linear method for FP8. From e7360275a90220676d6e9fea1cc273409374db1d Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 16 Oct 2024 11:48:36 +0000 Subject: [PATCH 21/39] Add benchmarks --- .../cusparseLt_benchmarks/benchmark_24.py | 227 ++++++++++++++++++ .../cusparseLt_benchmarks/weight_shapes.py | 43 ++++ 2 files changed, 270 insertions(+) create mode 100644 benchmarks/cusparseLt_benchmarks/benchmark_24.py create mode 100644 benchmarks/cusparseLt_benchmarks/weight_shapes.py diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py new file mode 100644 index 0000000000000..11aaf3ce6e03b --- /dev/null +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -0,0 +1,227 @@ +import argparse +import copy +import itertools +import pickle as pkl +import time +from typing import Callable, Iterable, List, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser +from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( + compress_to_torch_sparse_semi_structured_mat, + dense_matmul, get_random_mat, + is_semi_structured_supported, semi_structured_sparse_dense_gemm) + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +# helpers +def make_rand_tensors(dtype: torch.dtype, m: int, n: int, + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + a = get_random_mat(m, k, dtype) + b = get_random_mat(n, k, dtype).t() + return a, b + +# bench +def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, + **kwargs) -> TMeasurement: + min_run_time = 1 + + globals = { + "args": args, + "kwargs": kwargs, + "fn": fn, + } 
+ return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench(m: int, k: int, n: int, label: str, + sub_label: str, use_fp8: bool) -> Iterable[TMeasurement]: + a, b = make_rand_tensors(torch.float16, m, n, k) + + timers = [] + # pytorch float16 + timers.append( + bench_fn(label, sub_label, + "pytorch_fp16_fp16_matmul", torch.mm, + a.to(dtype=torch.float16), b.to(dtype=torch.float16))) + + # pytorch bf16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_matmul", + torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"))) + + # cusparseLt fp16 + timers.append( + bench_fn(label, + sub_label, + "cusparseLt_fp16_fp16_2_4", semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a), b) + ) + + # cusparseLt bf16 + timers.append( + bench_fn(label, + sub_label, + "cusparseLt_bf16_bf16_2_4", semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a.to(dtype=torch.bfloat16)), b.to(torch.bfloat16)) + ) + + a, b = make_rand_tensors(torch.int8, m, n, k) + # cutlass i8 + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_matmul-w-scales", + dense_matmul, a, b, torch.int8)) + + # cusparseLt i8 + timers.append( + bench_fn(label, + sub_label, + "cusparseLt_i8_i8_2_4", semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a), b) + ) + + if use_fp8: + a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + # cutlass fp8 + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul-w-scales", + dense_matmul, a, b, torch.float8_e4m3fn)) + + # cusparseLt fp8 + timers.append( + bench_fn(label, + sub_label, + "cusparseLt_fp8_fp8_2_4", semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a), b) + ) + + return timers + + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run(MKNs: Iterable[Tuple[int, int, int]], use_fp8: bool) -> Iterable[TMeasurement]: + results = [] + for m, k, n in MKNs: + timers = bench(m, k, n, f"gemm", f"MKN=({m}x{k}x{n})", use_fp8) + print_timers(timers) + results.extend(timers) + + return results + + +def make_output(data: Iterable[TMeasurement], + MKNs: Iterable[Tuple[int, int, int]], + base_description: str, + timestamp=None): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +def run_model_bench(args): + if not is_semi_structured_supported(): + raise ValueError("Device does not support semi-structured sparsity") + + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + assert m % 32 == 0, "Batch size has to be a multiple of 32" + for k, n in KNs: + if k % 32 or n % 32: + continue + MKNs.append((m, k, 
n)) + + data = run(MKNs, args.use_fp8) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results cuSparseLt {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == '__main__': + + parser = FlexibleArgumentParser( + description=""" +Benchmark cuSparseLt 2:4 GEMMs. + + To run dimensions from a model: + python3 ./benchmarks/cusparseLt_benchmarks/benchmark_24.py --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cusparseLt implementations for the various GEMMs. + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + parser.add_argument('--use-fp8', + action='store_true', + help='Add benchmarking fp8 matmul (on supporting fp8 platforms)') + + args = parser.parse_args() + run_model_bench(args) + + \ No newline at end of file diff --git a/benchmarks/cusparseLt_benchmarks/weight_shapes.py b/benchmarks/cusparseLt_benchmarks/weight_shapes.py new file mode 100644 index 0000000000000..25ec9d6028627 --- /dev/null +++ b/benchmarks/cusparseLt_benchmarks/weight_shapes.py @@ -0,0 +1,43 @@ +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 0), + ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], +} From ae66f77dcfd1a7984e7fec2254ec6762089545aa Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 23 Oct 2024 16:52:38 +0000 Subject: [PATCH 22/39] Fix typo --- .../model_executor/layers/quantization/fp8.py | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d01d0be9306af..d34579b7099bb 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -95,69 +95,6 @@ def get_quant_method(self, layer: torch.nn.Module, def get_scaled_act_names(self) -> List[str]: return [] -class Fp8Config(QuantizationConfig): - """Config class for FP8.""" - - def __init__( - self, - is_checkpoint_fp8_serialized: bool = False, - activation_scheme: str = "dynamic", 
- ignored_layers: Optional[List[str]] = None, - ) -> None: - self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized - if is_checkpoint_fp8_serialized: - logger.warning("Detected fp8 checkpoint. Please note that the " - "format is experimental and subject to change.") - if activation_scheme not in ACTIVATION_SCHEMES: - raise ValueError( - f"Unsupported activation scheme {activation_scheme}") - self.activation_scheme = activation_scheme - self.ignored_layers = ignored_layers or [] - - @classmethod - def get_name(cls) -> str: - return "fp8" - - @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.bfloat16, torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 80 - - @classmethod - def get_config_filenames(cls) -> List[str]: - return [] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": - quant_method = cls.get_from_keys(config, ["quant_method"]) - is_checkpoint_fp8_serialized = ("fp8" in quant_method) - activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) - ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) - return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, - activation_scheme=activation_scheme, - ignored_layers=ignored_layers) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - - if isinstance(layer, LinearBase): - if is_layer_skipped(prefix, self.ignored_layers): - return UnquantizedLinearMethod() - return Fp8LinearMethod(self) - elif isinstance(layer, FusedMoE): - return Fp8MoEMethod(self) - elif isinstance(layer, Attention): - return Fp8KVCacheMethod(self) - return None - - def get_scaled_act_names(self) -> List[str]: - return [] - - class Fp8LinearMethod(LinearMethodBase): """Linear method for FP8. From 59ee24db53b6404c063f20e73853ae67cf9f6d1d Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Thu, 24 Oct 2024 12:19:06 +0000 Subject: [PATCH 23/39] Added scaled_mm for fp8. Removed cmake check for cusparseLt, needs to be reverted when the cmake issue is resolved. --- CMakeLists.txt | 15 ++- .../cusparseLt_benchmarks/benchmark_24.py | 96 +++++++++---------- csrc/ops.h | 1 + .../fp8_semi_structured/cusparseLt.cpp | 41 +++++--- csrc/torch_bindings.cpp | 2 +- tests/kernels/test_semi_structured.py | 52 +++++++++- vllm/_custom_ops.py | 4 +- .../sparsity/utils/cusparse_2_4_utils.py | 28 +++++- 8 files changed, 161 insertions(+), 78 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e2baa980bca0f..c224c1377769b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -402,11 +402,16 @@ define_gpu_extension_target( target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) # If cuSparseLt is not installed we skip 2:4 optimizations -CHECK_INCLUDE_FILE_CXX("cusparseLt.h" HAVE_CUSPARSELT) -message(STATUS "Result of include cusparseLt ${HAVE_CUSPARSELT}") -if(HAVE_CUSPARSELT) - target_compile_definitions(_C PRIVATE VLLM_CUSPARSELT_ENABLED=1) -endif() +CHECK_INCLUDE_FILE_CXX("cusparseLt.h" HAVE_CUSPARSELT_H) + +# TODO has to be fixed. 
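+# Until the include check above works, VLLM_CUSPARSELT_ENABLED is defined
+# unconditionally; the intended guarded form is kept commented out below.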
+target_compile_definitions(_C PRIVATE VLLM_CUSPARSELT_ENABLED=1) + +# if(HAVE_CUSPARSELT_H) +# message(STATUS "cusparseLt found") +# target_compile_definitions(_C PRIVATE VLLM_CUSPARSELT_ENABLED=1) +# endif() + # # _moe_C extension # diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index 11aaf3ce6e03b..aa3328b0f17cf 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -10,17 +10,16 @@ from torch.utils.benchmark import Measurement as TMeasurement from weight_shapes import WEIGHT_SHAPES -from vllm import _custom_ops as ops -from vllm.utils import FlexibleArgumentParser from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( - compress_to_torch_sparse_semi_structured_mat, - dense_matmul, get_random_mat, + compress_to_torch_sparse_semi_structured_mat, dense_matmul, get_random_mat, is_semi_structured_supported, semi_structured_sparse_dense_gemm) +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [32, 64, 128, 256, 512] DEFAULT_TP_SIZES = [1] + # helpers def make_rand_tensors(dtype: torch.dtype, m: int, n: int, k: int) -> Tuple[torch.Tensor, torch.Tensor]: @@ -28,6 +27,7 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int, b = get_random_mat(n, k, dtype).t() return a, b + # bench def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs) -> TMeasurement: @@ -47,82 +47,75 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, ).blocked_autorange(min_run_time=min_run_time) -def bench(m: int, k: int, n: int, label: str, - sub_label: str, use_fp8: bool) -> Iterable[TMeasurement]: +def bench(m: int, k: int, n: int, label: str, sub_label: str, + use_fp8: bool) -> Iterable[TMeasurement]: a, b = make_rand_tensors(torch.float16, m, n, k) timers = [] # pytorch float16 timers.append( - bench_fn(label, sub_label, - "pytorch_fp16_fp16_matmul", torch.mm, + bench_fn(label, sub_label, "pytorch_fp16_fp16_matmul", torch.mm, a.to(dtype=torch.float16), b.to(dtype=torch.float16))) # pytorch bf16 timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_matmul", - torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), + bench_fn(label, sub_label, "pytorch_bf16_bf16_matmul", torch.mm, + a.to(dtype=torch.bfloat16, device="cuda"), b.to(dtype=torch.bfloat16, device="cuda"))) - + # cusparseLt fp16 timers.append( - bench_fn(label, - sub_label, - "cusparseLt_fp16_fp16_2_4", semi_structured_sparse_dense_gemm, - compress_to_torch_sparse_semi_structured_mat(a), b) - ) + bench_fn(label, sub_label, "cusparseLt_fp16_fp16_2_4", + semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a), b)) # cusparseLt bf16 timers.append( - bench_fn(label, - sub_label, - "cusparseLt_bf16_bf16_2_4", semi_structured_sparse_dense_gemm, - compress_to_torch_sparse_semi_structured_mat(a.to(dtype=torch.bfloat16)), b.to(torch.bfloat16)) - ) + bench_fn( + label, sub_label, "cusparseLt_bf16_bf16_2_4", + semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat( + a.to(dtype=torch.bfloat16)), b.to(torch.bfloat16))) a, b = make_rand_tensors(torch.int8, m, n, k) # cutlass i8 timers.append( bench_fn(label, sub_label, "cutlass_i8_i8_matmul-w-scales", dense_matmul, a, b, torch.int8)) - + # cusparseLt i8 timers.append( - bench_fn(label, - sub_label, - "cusparseLt_i8_i8_2_4", semi_structured_sparse_dense_gemm, - 
compress_to_torch_sparse_semi_structured_mat(a), b) - ) + bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4", + semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a), b)) if use_fp8: a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) # cutlass fp8 timers.append( bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul-w-scales", - dense_matmul, a, b, torch.float8_e4m3fn)) - + dense_matmul, a, b, torch.float8_e4m3fn)) + # cusparseLt fp8 timers.append( - bench_fn(label, - sub_label, - "cusparseLt_fp8_fp8_2_4", semi_structured_sparse_dense_gemm, - compress_to_torch_sparse_semi_structured_mat(a), b) - ) + bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", + semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a), b)) return timers - # runner def print_timers(timers: Iterable[TMeasurement]): compare = TBenchmark.Compare(timers) compare.print() -def run(MKNs: Iterable[Tuple[int, int, int]], use_fp8: bool) -> Iterable[TMeasurement]: +def run(MKNs: Iterable[Tuple[int, int, int]], + use_fp8: bool) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: - timers = bench(m, k, n, f"gemm", f"MKN=({m}x{k}x{n})", use_fp8) + timers = bench(m, k, n, "gemm", f"MKN=({m}x{k}x{n})", use_fp8) print_timers(timers) results.extend(timers) @@ -205,23 +198,22 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES.keys()) + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) parser.add_argument("--tp-sizes", - nargs="+", - type=int, - default=DEFAULT_TP_SIZES) + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) parser.add_argument("--batch-sizes", - nargs="+", - type=int, - default=DEFAULT_BATCH_SIZES) - parser.add_argument('--use-fp8', - action='store_true', - help='Add benchmarking fp8 matmul (on supporting fp8 platforms)') + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + parser.add_argument( + '--use-fp8', + action='store_true', + help='Add benchmarking fp8 matmul (on supporting fp8 platforms)') args = parser.parse_args() run_model_bench(args) - - \ No newline at end of file diff --git a/csrc/ops.h b/csrc/ops.h index 781a3518939ad..8943282953db3 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -228,6 +228,7 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input); torch::Tensor cslt_mm_fp8_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, + const c10::optional& alpha_opt, const c10::optional& bias_opt, bool transpose_result); #endif diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index c7e0e9de703c3..a5e8d0ca6c0bf 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -4,21 +4,20 @@ #include #define STUB_FUNC_IMPL() \ -torch::Tensor cslt_compress_fp8_semi_structured( \ - const torch::Tensor& input) { \ + torch::Tensor cslt_compress_fp8_semi_structured( \ + const torch::Tensor& input) { \ TORCH_CHECK(false, \ "Unsupported dtype for compressed matrix in current " \ "version of cuSPARSELt."); \ -} \ - \ -torch::Tensor cslt_mm_fp8_semi_structured( \ - const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ - const c10::optional& bias_opt, bool transpose_result) { \ + } \ + \ + torch::Tensor cslt_mm_fp8_semi_structured( \ + const 
torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ + const c10::optional& bias_opt, bool transpose_result) { \ TORCH_CHECK(false, \ "Unsupported dtype for compressed matrix multiplication in " \ "current version of cuSPARSELt."); \ -} - + } #if defined(VLLM_CUSPARSELT_ENABLED) @@ -72,6 +71,7 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { torch::Tensor cslt_mm_fp8_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, + const c10::optional& alpha_opt, const c10::optional& bias_opt, bool transpose_result) { TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); @@ -85,6 +85,7 @@ torch::Tensor cslt_mm_fp8_semi_structured( cusparseLtMatmulPlan_t plan; cusparseLtMatmulAlgSelection_t alg_sel; + int tensor_alpha_mode = 0; float alpha = 1.0; float beta = 0.0; cudaDataType input_type = CUDA_R_8F_E4M3; @@ -168,8 +169,24 @@ torch::Tensor cslt_mm_fp8_semi_structured( sizeof(dBias))); } + const auto alpha_tensor = + alpha_opt.has_value() ? *alpha_opt : torch::Tensor{}; + auto alpha_ptr = α + if (alpha_opt.has_value()) { + if (alpha_tensor.numel() == 1) { + alpha = alpha_tensor.item(); + } else { + tensor_alpha_mode = 1; + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( + &handle, &matmul, CUSPARSELT_MATMUL_ALPHA_VECTOR_SCALING, + &tensor_alpha_mode, sizeof(tensor_alpha_mode))); + alpha_ptr = static_cast(alpha_tensor.data_ptr()); + } + } + cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT); + cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel); size_t workspace_size; TORCH_CUDASPARSE_CHECK( @@ -180,7 +197,7 @@ torch::Tensor cslt_mm_fp8_semi_structured( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( - &handle, &plan, &alpha, compressed_A.data_ptr(), dense_B.data_ptr(), + &handle, &plan, alpha_ptr, compressed_A.data_ptr(), dense_B.data_ptr(), &beta, res.data_ptr(), res.data_ptr(), workspacePtr.get(), &stream, 1)); // Destroy descriptors @@ -193,11 +210,11 @@ torch::Tensor cslt_mm_fp8_semi_structured( TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); return res; } -#else + #else STUB_FUNC_IMPL() -#endif + #endif #else diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 6cc5dd5b01a50..9c2520c7fcd23 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -329,7 +329,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cslt_mm_fp8_semi_structured(Tensor! compressed_A, Tensor! denseB," - "Tensor!? bias, bool transpose_result) -> Tensor"); + "Tensor!? alpha, Tensor!? 
bias, bool transpose_result) -> Tensor"); ops.impl("cslt_mm_fp8_semi_structured", torch::kCUDA, &cslt_mm_fp8_semi_structured); diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index f14e959a8ad29..0b54006e8997d 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -7,7 +7,8 @@ decompress_torch_sparse_semi_structured_mat, dense_matmul, generate_pruned_semi_structured_mat, get_random_mat, is_semi_structured_supported, semi_structured_dense_sparse_T_gemm, - semi_structured_sparse_dense_gemm) + semi_structured_sparse_dense_gemm, + semi_structured_sparse_dense_gemm_scaled) DTYPES = [torch.float16, torch.bfloat16, torch.int8] SIZES = [(128, 128), (1024, 8192)] @@ -15,6 +16,20 @@ MNK = [(128, 128, 128), (128, 512, 1024), (512, 512, 512), (1024, 2048, 4096)] +# From pytorch test +def to_float8(x, dtype=torch.float8_e4m3fn): + finfo = torch.finfo(dtype) + # Calculate the scale as dtype max divided by absmax + scale = finfo.max / x.abs().max().clamp(min=1e-12) + # scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) + # Return both float8 data and the inverse scale (as float), + # as both required as inputs to torch._scaled_mm + return x_scl_sat.to(dtype), scale.float().reciprocal() + + @pytest.mark.skipif( not is_semi_structured_supported(), reason="Semi structured matmul is not supported on this GPU type.") @@ -29,7 +44,8 @@ def test_semi_structured_compress(size, dtype): # TODO modelopt config has to be replaced with corresponding fp8_24 config @pytest.mark.skipif( - not is_semi_structured_supported() or not is_quant_method_supported("modelopt"), + not is_semi_structured_supported() + or not is_quant_method_supported("modelopt"), reason="Semi structured fp8 matmul is not supported on this GPU type.") @pytest.mark.parametrize("size", SIZES_FP8) def test_semi_structured_fp8_compress(size): @@ -78,7 +94,8 @@ def test_torch_semi_structured_sparse_dense_T_matmul(mnk, dtype): # TODO modelopt config has to be replaced with corresponding fp8_24 config @pytest.mark.skipif( - not is_semi_structured_supported() or not is_quant_method_supported("modelopt"), + not is_semi_structured_supported() + or not is_quant_method_supported("modelopt"), reason="Semi structured fp8 matmul is not supported on this GPU type.") def test_torch_semi_structured_sparse_dense_T_fp8_matmul(): M, N, K = (32, 64, 32) @@ -92,6 +109,32 @@ def test_torch_semi_structured_sparse_dense_T_fp8_matmul(): torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) +@pytest.mark.skipif( + not is_semi_structured_supported() + or not is_quant_method_supported("modelopt"), + reason="Semi structured fp8 matmul is not supported on this GPU type.") +def test_torch_semi_structured_sparse_dense_T_fp8_scaled_matmul(): + M, N, K = (32, 64, 32) + A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=torch.float16) + A_pruned_fp8, scale_A = to_float8(A_pruned) + B = torch.rand((K, N), device='cuda').to(torch.float16).t() + B_fp8, scale_B = to_float8(B) + + A_fp8_sparse = compress_to_torch_sparse_semi_structured_mat(A_pruned_fp8) + + C = torch._scaled_mm(A_pruned_fp8, + B_fp8, + scale_a=scale_A, + scale_b=scale_B, + out_dtype=torch.float32) + C_sparse = semi_structured_sparse_dense_gemm_scaled(A_fp8_sparse, + B_fp8, + scale_a=scale_A, + scale_b=scale_B).to( + torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=7e-2, 
atol=7e-2) + + @pytest.mark.skipif( not is_semi_structured_supported(), reason="Semi structured matmul is not supported on this GPU type.") @@ -110,7 +153,8 @@ def test_torch_semi_structured_dense_sparse_T_matmul(mnk, dtype): # TODO modelopt config has to be replaced with corresponding fp8_24 config @pytest.mark.skipif( - not is_semi_structured_supported() or not is_quant_method_supported("modelopt"), + not is_semi_structured_supported() + or not is_quant_method_supported("modelopt"), reason="Semi structured fp8 matmul is not supported on this GPU type.") def test_torch_semi_structured_dense_sparse_T_fp8_matmul(): M, N, K = (32, 64, 32) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index b098330f1dfa8..4a3bfcba2bbe0 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -715,11 +715,13 @@ def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: def semi_structured_fp8_mm(A_compressed: torch.Tensor, B_dense: torch.Tensor, + alpha: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, transpose_result: bool = False) -> torch.Tensor: assert A_compressed.dtype == torch.float8_e4m3fn return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, - bias, transpose_result) + alpha, bias, + transpose_result) # int8 diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 201814b4f0401..66913833a7e53 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -47,20 +47,42 @@ def decompress_torch_sparse_semi_structured_mat(sp_mat): def semi_structured_sparse_dense_gemm(a_sparse: torch.Tensor, - b_dense: torch.Tensor): + b_dense: torch.Tensor, + bias: torch.Tensor = None): assert a_sparse.dtype in [ torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn ], f"Semi structured sparse-dense matmul does not support {a_sparse.dtype}" if a_sparse.dtype == torch.float8_e4m3fn: return semi_structured_fp8_mm(a_sparse.packed, b_dense, + bias=bias, transpose_result=False) else: return torch.mm(a_sparse, b_dense) -def semi_structured_dense_sparse_T_gemm(a: torch.Tensor, b_T: torch.Tensor): - return (semi_structured_sparse_dense_gemm(b_T, a.t())).t() +def semi_structured_dense_sparse_T_gemm(a: torch.Tensor, + b_T: torch.Tensor, + bias: torch.Tensor = None): + return (semi_structured_sparse_dense_gemm(b_T, a.t(), bias)).t() + + +def semi_structured_sparse_dense_gemm_scaled(a_sparse: torch.Tensor, + b_dense: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + bias: torch.Tensor = None): + assert (a_sparse.dtype == torch.float8_e4m3fn + and b_dense.dtype == torch.float8_e4m3fn) + assert not b_dense.is_contiguous( + ), "cusparseLt requires dense matrix be non-contiguous" + # cusparseLt requires alpha to be float + assert scale_a.dtype == torch.float32 and scale_b.dtype == torch.float32 + return semi_structured_fp8_mm(a_sparse.packed, + b_dense, + alpha=scale_a * scale_b, + bias=bias, + transpose_result=False) # test utils From 3367704c49aa82c24b995d5c4936a05c43b04524 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Mon, 28 Oct 2024 15:14:05 +0000 Subject: [PATCH 24/39] Add docstrings --- .../sparsity/utils/cusparse_2_4_utils.py | 111 +++++++++++++----- 1 file changed, 79 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 
66913833a7e53..bdae2e9f765fe 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -10,11 +10,19 @@ from vllm.platforms import current_platform -def compress_to_torch_sparse_semi_structured_mat(original_tensor): - if original_tensor.dtype == torch.float8_e4m3fn: - packed = semi_structured_fp8_compress(original_tensor) +def compress_to_torch_sparse_semi_structured_mat(pruned_tensor: torch.Tensor): + ''' + Compresses original pruned (with zeros) tensor into packed version + Args: + pruned_tensor(torch.Tensor) - pruned but not packed tensor + Returns: + torch.SparseSemiStructuredTensorCUSPARSELT: torch wrapped cusparseLt-packed tensor. + ''' + + if pruned_tensor.dtype == torch.float8_e4m3fn: + packed = semi_structured_fp8_compress(pruned_tensor) return SparseSemiStructuredTensorCUSPARSELT( - shape=original_tensor.shape, + shape=pruned_tensor.shape, packed=packed, meta=None, packed_t=None, @@ -23,62 +31,101 @@ def compress_to_torch_sparse_semi_structured_mat(original_tensor): fuse_transpose_cusparselt=SparseSemiStructuredTensor. _FUSE_TRANSPOSE, alg_id_cusparselt=SparseSemiStructuredTensor._DEFAULT_ALG_ID, - requires_grad=original_tensor.requires_grad, + requires_grad=pruned_tensor.requires_grad, ) else: - return to_sparse_semi_structured(original_tensor) - - -def decompress_torch_sparse_semi_structured_mat(sp_mat): - if sp_mat.dtype == torch.float8_e4m3fn: - return semi_structured_fp8_mm(sp_mat.packed, - torch.eye(sp_mat.shape[-1], - dtype=sp_mat.dtype, - device=sp_mat.device).t(), + return to_sparse_semi_structured(pruned_tensor) + +# +def decompress_torch_sparse_semi_structured_mat(packed_tensor: torch.Tensor): + ''' + Unpacks the cusparseLt packed tensor into pruned tensor + Args: + packed_tensor - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat. + Returns: + pruned (torch.Tensor) - pruned torch.tensor + ''' + if packed_tensor.dtype == torch.float8_e4m3fn: + return semi_structured_fp8_mm(packed_tensor.packed, + torch.eye(packed_tensor.shape[-1], + dtype=packed_tensor.dtype, + device=packed_tensor.device).t(), transpose_result=False) else: # Fix of to_dense() function supporting int8 # cuSparseLT for int8 requires dense matrix to be non-contiguous return torch.mm( - sp_mat, - torch.eye(sp_mat.shape[-1], - dtype=sp_mat.dtype, - device=sp_mat.device).t()) + packed_tensor, + torch.eye(packed_tensor.shape[-1], + dtype=packed_tensor.dtype, + device=packed_tensor.device).t()) -def semi_structured_sparse_dense_gemm(a_sparse: torch.Tensor, +def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, b_dense: torch.Tensor, bias: torch.Tensor = None): - assert a_sparse.dtype in [ + ''' + Performs matrix multiplication (A @ B) of semi-structured sparse (A) and dense (B) matrices + Args: + a_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat. + b_dense (torch.Tensor) - dense matrix tensor. + bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. + Result: + torch.Tensor - Result of matrix multiplication. 
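+
+    Example (illustrative sketch; mirrors the usage in
+    tests/kernels/test_semi_structured.py, variable names are placeholders):
+        A_packed = compress_to_torch_sparse_semi_structured_mat(A_pruned)
+        C = semi_structured_sparse_dense_gemm(A_packed, B_dense)  # A_pruned: (M, K) 2:4-pruned, B_dense: (K, N)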
+ ''' + assert a_packed.dtype in [ torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn - ], f"Semi structured sparse-dense matmul does not support {a_sparse.dtype}" - if a_sparse.dtype == torch.float8_e4m3fn: - return semi_structured_fp8_mm(a_sparse.packed, + ], f"Semi structured sparse-dense matmul does not support {a_packed.dtype}" + if a_packed.dtype == torch.float8_e4m3fn: + return semi_structured_fp8_mm(a_packed.packed, b_dense, bias=bias, transpose_result=False) else: - return torch.mm(a_sparse, b_dense) + return torch.mm(a_packed, b_dense) -def semi_structured_dense_sparse_T_gemm(a: torch.Tensor, - b_T: torch.Tensor, +def semi_structured_dense_sparse_T_gemm(a_dense: torch.Tensor, + b_T_packed: torch.Tensor, bias: torch.Tensor = None): - return (semi_structured_sparse_dense_gemm(b_T, a.t(), bias)).t() - - -def semi_structured_sparse_dense_gemm_scaled(a_sparse: torch.Tensor, + ''' + Performs matrix multiplication (a @ b_T) of transposed semi-structured sparse and dense matrices + Args: + a_dense (torch.Tensor) - dense matrix tensor. + b_T_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat + bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. + + Returns: + torch.Tensor - Result of matrix multiplication. + ''' + return (semi_structured_sparse_dense_gemm(b_T_packed, a_dense.t(), bias)).t() + + +def semi_structured_sparse_dense_gemm_scaled(a_packed: torch.Tensor, b_dense: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor = None): - assert (a_sparse.dtype == torch.float8_e4m3fn + ''' + Performs scaled matrix multiplication (a @ b) of transposed semi-structured sparse and dense fp8 matrices + Args: + a_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat. + b_dense (torch.Tensor) - dense matrix tensor. + scale_a (torch.Tensor) - scaling factor for sparse matrix, must be in float32. + scale_b (torch.Tensor) - scaling factor for dense matrix, must be in float32. + bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. + + Returns: + torch.Tensor - Result of matrix multiplication. 
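+
+    Example (illustrative sketch; `to_float8` stands for any helper that
+    returns an fp8 tensor together with its float32 inverse scale, as in
+    tests/kernels/test_semi_structured.py):
+        A_fp8, scale_a = to_float8(A_pruned)   # A_pruned: fp16, 2:4-pruned
+        B_fp8, scale_b = to_float8(B.t())      # keep B_fp8 non-contiguous (see assert below)
+        A_packed = compress_to_torch_sparse_semi_structured_mat(A_fp8)
+        C = semi_structured_sparse_dense_gemm_scaled(A_packed, B_fp8, scale_a, scale_b)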
+ ''' + + assert (a_packed.dtype == torch.float8_e4m3fn and b_dense.dtype == torch.float8_e4m3fn) assert not b_dense.is_contiguous( ), "cusparseLt requires dense matrix be non-contiguous" # cusparseLt requires alpha to be float assert scale_a.dtype == torch.float32 and scale_b.dtype == torch.float32 - return semi_structured_fp8_mm(a_sparse.packed, + return semi_structured_fp8_mm(a_packed.packed, b_dense, alpha=scale_a * scale_b, bias=bias, From 368beec9c4438b907157a0dd17b56b6200dc999b Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 30 Oct 2024 09:37:36 +0000 Subject: [PATCH 25/39] Update for torch 2.5 --- vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index bdae2e9f765fe..7770bbfd09138 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -9,6 +9,7 @@ semi_structured_fp8_mm) from vllm.platforms import current_platform +SparseSemiStructuredTensor._FORCE_CUTLASS = False def compress_to_torch_sparse_semi_structured_mat(pruned_tensor: torch.Tensor): ''' From 5be53f3027b6311bc4840ff42ba74331217fd531 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 30 Oct 2024 10:13:17 +0000 Subject: [PATCH 26/39] Add handling contiguous dense input for int8 and fp8 --- tests/kernels/test_semi_structured.py | 14 +++++++++----- .../layers/sparsity/utils/cusparse_2_4_utils.py | 5 ++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index 0b54006e8997d..c098be7820d7c 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -65,15 +65,19 @@ def test_semi_structured_fp8_compress(size): @pytest.mark.parametrize("mnk", MNK) @pytest.mark.parametrize("dtype", DTYPES) def test_torch_semi_structured_sparse_dense_matmul(mnk, dtype): - if dtype is torch.int8: - pytest.skip("cusparse does not support sparse x non transposed dense") + # if dtype is torch.int8: + # pytest.skip("cusparse does not support sparse x non transposed dense") M, N, K = mnk A_pruned = generate_pruned_semi_structured_mat(M, K, dtype) A = compress_to_torch_sparse_semi_structured_mat(A_pruned) B = get_random_mat(K, N, dtype) - C_sparse = semi_structured_sparse_dense_gemm(A, B) - C = dense_matmul(A_pruned, B, dtype) - torch.testing.assert_close(C, C_sparse) + if dtype is torch.int8: + with pytest.raises(ValueError) as e: + C_sparse = semi_structured_sparse_dense_gemm(A, B) + else: + C_sparse = semi_structured_sparse_dense_gemm(A, B) + C = dense_matmul(A_pruned, B, dtype) + torch.testing.assert_close(C, C_sparse) @pytest.mark.skipif( diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 7770bbfd09138..4afe325f4e3a9 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -66,7 +66,8 @@ def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, b_dense: torch.Tensor, bias: torch.Tensor = None): ''' - Performs matrix multiplication (A @ B) of semi-structured sparse (A) and dense (B) matrices + Performs matrix multiplication (A @ B) of semi-structured sparse (A) and dense (B) matrices. 
+ In case of int8 and fp8 types, dense matrix B has to be non-contiguous. Args: a_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat. b_dense (torch.Tensor) - dense matrix tensor. @@ -77,6 +78,8 @@ def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, assert a_packed.dtype in [ torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn ], f"Semi structured sparse-dense matmul does not support {a_packed.dtype}" + if b_dense.is_contiguous() and a_packed.dtype in [torch.int8, torch.float8_e4m3fn]: + raise ValueError("cuSparseLt does not support contiguous dense matrix for int8 and fp8 types") if a_packed.dtype == torch.float8_e4m3fn: return semi_structured_fp8_mm(a_packed.packed, b_dense, From f9546a8b1cd15d7d9d33b34a4fea4e6ccbb52bf8 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 9 Oct 2024 13:56:35 +0000 Subject: [PATCH 27/39] Add fp8 cusparseLt --- .../fp8_semi_structured/cusparseLt.h | 244 ++++++++++++++++++ tests/kernels/test_semi_structured.py | 2 + vllm/_custom_ops.py | 8 + .../sparsity/utils/cusparse_2_4_utils.py | 11 + 4 files changed, 265 insertions(+) create mode 100644 csrc/quantization/fp8_semi_structured/cusparseLt.h diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.h b/csrc/quantization/fp8_semi_structured/cusparseLt.h new file mode 100644 index 0000000000000..867705c117074 --- /dev/null +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.h @@ -0,0 +1,244 @@ +#include +#include + +#include +#include + +namespace vllm { + + +cusparseLtHandle_t handle; +bool handle_initialized = false; +#if not (defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602) + +torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { + + TORCH_CHECK(input.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress") + if (!handle_initialized){ + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); + handle_initialized = true; + } + // create sparse descriptor, dtype + auto compression_factor = 9; + cusparseLtMatDescriptor_t input_descriptor; + cudaDataType type = CUDA_R_8F_E4M3; + auto compressed_tensor = input.new_empty(input.numel() * compression_factor / 16); + + TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( + &handle, + &input_descriptor, + input.size(0), + input.size(1), + input.size(1), + 16, + type, + CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT)); + + size_t compressed_size, compressed_buffer_size; + TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompressedSize2( + &handle, + &input_descriptor, + &compressed_size, + &compressed_buffer_size)); + + auto& allocator = ::c10::cuda::CUDACachingAllocator::get(); + auto compressedBufferPtr = allocator.allocate(compressed_buffer_size); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompress2( + &handle, + &input_descriptor, + true, + CUSPARSE_OPERATION_NON_TRANSPOSE, + input.data_ptr(), + compressed_tensor.data_ptr(), + compressedBufferPtr.get(), + stream)); + return compressed_tensor; +} + +torch::Tensor cslt_mm_fp8_semi_structured( + const torch::Tensor& compressed_A, + const torch::Tensor& dense_B, + const c10::optional& bias_opt, + bool transpose_result +) +{ + TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); + + if (!handle_initialized){ + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); + handle_initialized = true; + } + // cusparseLt data structures + 
cusparseLtMatmulDescriptor_t matmul; + cusparseLtMatmulPlan_t plan; + cusparseLtMatmulAlgSelection_t alg_sel; + + float alpha = 1.0; + float beta = 0.0; + cudaDataType input_type = CUDA_R_8F_E4M3; + cudaDataType output_type; + cudaDataType C_type; + cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; + auto compression_factor = 9; + ScalarType out_dtype = dense_B.scalar_type(); + + switch (out_dtype) + { + case at::ScalarType::Float8_e4m3fn: + output_type = CUDA_R_8F_E4M3; + C_type = CUDA_R_16F; + break; + case at::ScalarType::Half: + output_type = CUDA_R_16F; + C_type = CUDA_R_16F; + break; + case at::ScalarType::BFloat16: + output_type = CUDA_R_16BF; + C_type = CUDA_R_16BF; + break; + case at::ScalarType::Float: + output_type = CUDA_R_32F; + C_type = CUDA_R_32F; + break; + default: + TORCH_CHECK(false, "Unsupported out_dtype passed, must be one of {fp16, bf16, float32} for fp8 inputs"); + break; + } + + int64_t k = dense_B.size(0); + int64_t n = dense_B.size(1); + int64_t m = (compressed_A.numel() * 16 / compression_factor ) / k; + + + //initialize sparse descriptor + cusparseLtMatDescriptor_t sparse_input_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( + &handle, + &sparse_input_descriptor, + m, + k, + k, + 16, + input_type, + CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT)); + + // initialize dense input descriptor + cusparseLtMatDescriptor_t dense_input_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, + &dense_input_descriptor, + (dense_B.is_contiguous()) ? k : n, + (dense_B.is_contiguous()) ? n : k, + (dense_B.is_contiguous()) ? n : k, + 16, + input_type, + CUSPARSE_ORDER_ROW)); + + // create result tensor + auto res_tensor_options = c10::TensorOptions().dtype(out_dtype).device(dense_B.device()); + at::Tensor res = (transpose_result) ? at::empty({n, m}, res_tensor_options) + : at::empty({m, n}, res_tensor_options); + + cusparseLtMatDescriptor_t res_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, + &res_descriptor, + m, + n, + (transpose_result) ? m: n, + 16, + output_type, + (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + + cusparseLtMatDescriptor_t C_descriptor; + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, + &C_descriptor, + m, + n, + (transpose_result) ? m: n, + 16, + C_type, + (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( + &handle, + &matmul, + CUSPARSE_OPERATION_NON_TRANSPOSE, + (dense_B.is_contiguous()) ? 
CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE, + &sparse_input_descriptor, + &dense_input_descriptor, + &C_descriptor, + &res_descriptor, + compute_type)); + + // set bias pointer for matmul, need to assign to get location + if (bias_opt.has_value()) { + auto& bias = bias_opt.value(); + void* dBias = bias.data_ptr(); + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( + &handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, sizeof(dBias))); + } + + cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, + CUSPARSELT_MATMUL_ALG_DEFAULT); + cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel); + size_t workspace_size; + TORCH_CUDASPARSE_CHECK( + cusparseLtMatmulGetWorkspace(&handle, &plan, &workspace_size)); + + + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + auto workspacePtr = allocator.allocate(workspace_size); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( + &handle, + &plan, + &alpha, + compressed_A.data_ptr(), + dense_B.data_ptr(), + &beta, + res.data_ptr(), + res.data_ptr(), + workspacePtr.get(), + // jank because of the way we want this to be an array of streams + &stream, + 1)); + + // Destroy descriptors + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(&sparse_input_descriptor)); + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(&dense_input_descriptor)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&res_descriptor)); + // Destroy plan + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); + return res; +} +#else + +torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { + TORCH_CHECK(false, "Unsupported dtype for compressed matrix in current version of cuSPARSELt."); +} + +at::Tensor cslt_mm_fp8_semi_structured( + const Tensor& compressed_A, + const Tensor& dense_B, + const std::optional& bias_opt, + bool transpose_result, +) +{ +#if not (defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602) + TORCH_CHECK(false, "Unsupported dtype for compressed matrix multiplication in current version of cuSPARSELt."); +#endif +} + +#endif + + +} // namespace vllm \ No newline at end of file diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index c098be7820d7c..338c4dfa79c84 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -36,6 +36,8 @@ def to_float8(x, dtype=torch.float8_e4m3fn): @pytest.mark.parametrize("size", SIZES) @pytest.mark.parametrize("dtype", DTYPES) def test_semi_structured_compress(size, dtype): + if dtype == torch.float8_e4m3fn and not is_quant_method_supported("fp8"): + pytest.skip("fp8 is not supported on this device") input_pruned = generate_pruned_semi_structured_mat(*size, dtype) output_pruned = decompress_torch_sparse_semi_structured_mat( compress_to_torch_sparse_semi_structured_mat(input_pruned)) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 4a3bfcba2bbe0..50c001bbd6725 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -706,6 +706,14 @@ def scaled_fp8_quant( return output, scale +# semi structured fp8 +def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: + assert input.dtype == torch.float8_e4m3fn + return torch.ops._C.cslt_compress_fp8_semi_structured(input) + +def semi_structured_fp8_mm(A_compressed: torch.Tensor, B_dense: torch.Tensor, bias: Optional[torch.Tensor], transpose_result: bool = False) -> torch.Tensor: + assert A_compressed.dtype == torch.float8_e4m3fn + 
return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, bias, transpose_result) # semi structured fp8 def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 4afe325f4e3a9..e966bf1ed0ed7 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -147,6 +147,17 @@ def dense_matmul(A, B, dtype): return A @ B +# test utils +def dense_matmul(A, B, dtype): + if dtype in [torch.int8, torch.float8_e4m3fn]: + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + return ops.cutlass_scaled_mm(A, B, scale_a, scale_b, + torch.bfloat16).to(dtype) + else: + return A @ B + + def is_semi_structured_supported() -> bool: if not (current_platform.is_cuda() or current_platform.is_rocm()): return False From 2187236f06fec0d805695e0aeb80655ee7b53d24 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Sun, 13 Oct 2024 20:55:04 +0000 Subject: [PATCH 28/39] Fix compilation and tests --- .../fp8_semi_structured/cusparseLt.h | 244 ------------------ tests/kernels/test_semi_structured.py | 2 - vllm/_custom_ops.py | 11 +- .../sparsity/utils/cusparse_2_4_utils.py | 16 +- 4 files changed, 13 insertions(+), 260 deletions(-) delete mode 100644 csrc/quantization/fp8_semi_structured/cusparseLt.h diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.h b/csrc/quantization/fp8_semi_structured/cusparseLt.h deleted file mode 100644 index 867705c117074..0000000000000 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.h +++ /dev/null @@ -1,244 +0,0 @@ -#include -#include - -#include -#include - -namespace vllm { - - -cusparseLtHandle_t handle; -bool handle_initialized = false; -#if not (defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602) - -torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { - - TORCH_CHECK(input.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress") - if (!handle_initialized){ - TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); - handle_initialized = true; - } - // create sparse descriptor, dtype - auto compression_factor = 9; - cusparseLtMatDescriptor_t input_descriptor; - cudaDataType type = CUDA_R_8F_E4M3; - auto compressed_tensor = input.new_empty(input.numel() * compression_factor / 16); - - TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &handle, - &input_descriptor, - input.size(0), - input.size(1), - input.size(1), - 16, - type, - CUSPARSE_ORDER_ROW, - CUSPARSELT_SPARSITY_50_PERCENT)); - - size_t compressed_size, compressed_buffer_size; - TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompressedSize2( - &handle, - &input_descriptor, - &compressed_size, - &compressed_buffer_size)); - - auto& allocator = ::c10::cuda::CUDACachingAllocator::get(); - auto compressedBufferPtr = allocator.allocate(compressed_buffer_size); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompress2( - &handle, - &input_descriptor, - true, - CUSPARSE_OPERATION_NON_TRANSPOSE, - input.data_ptr(), - compressed_tensor.data_ptr(), - compressedBufferPtr.get(), - stream)); - return compressed_tensor; -} - -torch::Tensor cslt_mm_fp8_semi_structured( - const torch::Tensor& compressed_A, - const torch::Tensor& dense_B, - const c10::optional& bias_opt, - bool 
transpose_result -) -{ - TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); - - if (!handle_initialized){ - TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); - handle_initialized = true; - } - // cusparseLt data structures - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulPlan_t plan; - cusparseLtMatmulAlgSelection_t alg_sel; - - float alpha = 1.0; - float beta = 0.0; - cudaDataType input_type = CUDA_R_8F_E4M3; - cudaDataType output_type; - cudaDataType C_type; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; - auto compression_factor = 9; - ScalarType out_dtype = dense_B.scalar_type(); - - switch (out_dtype) - { - case at::ScalarType::Float8_e4m3fn: - output_type = CUDA_R_8F_E4M3; - C_type = CUDA_R_16F; - break; - case at::ScalarType::Half: - output_type = CUDA_R_16F; - C_type = CUDA_R_16F; - break; - case at::ScalarType::BFloat16: - output_type = CUDA_R_16BF; - C_type = CUDA_R_16BF; - break; - case at::ScalarType::Float: - output_type = CUDA_R_32F; - C_type = CUDA_R_32F; - break; - default: - TORCH_CHECK(false, "Unsupported out_dtype passed, must be one of {fp16, bf16, float32} for fp8 inputs"); - break; - } - - int64_t k = dense_B.size(0); - int64_t n = dense_B.size(1); - int64_t m = (compressed_A.numel() * 16 / compression_factor ) / k; - - - //initialize sparse descriptor - cusparseLtMatDescriptor_t sparse_input_descriptor; - TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &handle, - &sparse_input_descriptor, - m, - k, - k, - 16, - input_type, - CUSPARSE_ORDER_ROW, - CUSPARSELT_SPARSITY_50_PERCENT)); - - // initialize dense input descriptor - cusparseLtMatDescriptor_t dense_input_descriptor; - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &handle, - &dense_input_descriptor, - (dense_B.is_contiguous()) ? k : n, - (dense_B.is_contiguous()) ? n : k, - (dense_B.is_contiguous()) ? n : k, - 16, - input_type, - CUSPARSE_ORDER_ROW)); - - // create result tensor - auto res_tensor_options = c10::TensorOptions().dtype(out_dtype).device(dense_B.device()); - at::Tensor res = (transpose_result) ? at::empty({n, m}, res_tensor_options) - : at::empty({m, n}, res_tensor_options); - - cusparseLtMatDescriptor_t res_descriptor; - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &handle, - &res_descriptor, - m, - n, - (transpose_result) ? m: n, - 16, - output_type, - (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); - - cusparseLtMatDescriptor_t C_descriptor; - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &handle, - &C_descriptor, - m, - n, - (transpose_result) ? m: n, - 16, - C_type, - (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); - - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( - &handle, - &matmul, - CUSPARSE_OPERATION_NON_TRANSPOSE, - (dense_B.is_contiguous()) ? 
CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE, - &sparse_input_descriptor, - &dense_input_descriptor, - &C_descriptor, - &res_descriptor, - compute_type)); - - // set bias pointer for matmul, need to assign to get location - if (bias_opt.has_value()) { - auto& bias = bias_opt.value(); - void* dBias = bias.data_ptr(); - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( - &handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, sizeof(dBias))); - } - - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, - CUSPARSELT_MATMUL_ALG_DEFAULT); - cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel); - size_t workspace_size; - TORCH_CUDASPARSE_CHECK( - cusparseLtMatmulGetWorkspace(&handle, &plan, &workspace_size)); - - - auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); - auto workspacePtr = allocator.allocate(workspace_size); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( - &handle, - &plan, - &alpha, - compressed_A.data_ptr(), - dense_B.data_ptr(), - &beta, - res.data_ptr(), - res.data_ptr(), - workspacePtr.get(), - // jank because of the way we want this to be an array of streams - &stream, - 1)); - - // Destroy descriptors - TORCH_CUDASPARSE_CHECK( - cusparseLtMatDescriptorDestroy(&sparse_input_descriptor)); - TORCH_CUDASPARSE_CHECK( - cusparseLtMatDescriptorDestroy(&dense_input_descriptor)); - TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&res_descriptor)); - // Destroy plan - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); - return res; -} -#else - -torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { - TORCH_CHECK(false, "Unsupported dtype for compressed matrix in current version of cuSPARSELt."); -} - -at::Tensor cslt_mm_fp8_semi_structured( - const Tensor& compressed_A, - const Tensor& dense_B, - const std::optional& bias_opt, - bool transpose_result, -) -{ -#if not (defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602) - TORCH_CHECK(false, "Unsupported dtype for compressed matrix multiplication in current version of cuSPARSELt."); -#endif -} - -#endif - - -} // namespace vllm \ No newline at end of file diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index 338c4dfa79c84..c098be7820d7c 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -36,8 +36,6 @@ def to_float8(x, dtype=torch.float8_e4m3fn): @pytest.mark.parametrize("size", SIZES) @pytest.mark.parametrize("dtype", DTYPES) def test_semi_structured_compress(size, dtype): - if dtype == torch.float8_e4m3fn and not is_quant_method_supported("fp8"): - pytest.skip("fp8 is not supported on this device") input_pruned = generate_pruned_semi_structured_mat(*size, dtype) output_pruned = decompress_torch_sparse_semi_structured_mat( compress_to_torch_sparse_semi_structured_mat(input_pruned)) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 50c001bbd6725..2f46003a5d582 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -706,14 +706,21 @@ def scaled_fp8_quant( return output, scale + # semi structured fp8 def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: assert input.dtype == torch.float8_e4m3fn return torch.ops._C.cslt_compress_fp8_semi_structured(input) -def semi_structured_fp8_mm(A_compressed: torch.Tensor, B_dense: torch.Tensor, bias: Optional[torch.Tensor], transpose_result: bool = False) -> torch.Tensor: + +def semi_structured_fp8_mm(A_compressed: torch.Tensor, + 
B_dense: torch.Tensor, + bias: Optional[torch.Tensor] = None, + transpose_result: bool = False) -> torch.Tensor: assert A_compressed.dtype == torch.float8_e4m3fn - return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, bias, transpose_result) + return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, + bias, transpose_result) + # semi structured fp8 def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index e966bf1ed0ed7..58af70318aac8 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -3,6 +3,9 @@ from torch.sparse import (SparseSemiStructuredTensor, SparseSemiStructuredTensorCUSPARSELT, to_sparse_semi_structured) +from torch.sparse import (SparseSemiStructuredTensor, + SparseSemiStructuredTensorCUSPARSELT, + to_sparse_semi_structured) from vllm import _custom_ops as ops from vllm._custom_ops import (semi_structured_fp8_compress, @@ -37,7 +40,7 @@ def compress_to_torch_sparse_semi_structured_mat(pruned_tensor: torch.Tensor): else: return to_sparse_semi_structured(pruned_tensor) -# + def decompress_torch_sparse_semi_structured_mat(packed_tensor: torch.Tensor): ''' Unpacks the cusparseLt packed tensor into pruned tensor @@ -147,17 +150,6 @@ def dense_matmul(A, B, dtype): return A @ B -# test utils -def dense_matmul(A, B, dtype): - if dtype in [torch.int8, torch.float8_e4m3fn]: - scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) - scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - return ops.cutlass_scaled_mm(A, B, scale_a, scale_b, - torch.bfloat16).to(dtype) - else: - return A @ B - - def is_semi_structured_supported() -> bool: if not (current_platform.is_cuda() or current_platform.is_rocm()): return False From f45a83b481ef8d813a80c04ab8e4439fe5017174 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 23 Oct 2024 16:53:42 +0000 Subject: [PATCH 29/39] Add caching of cusparseLT meta --- CMakeLists.txt | 9 +- .../cusparseLt_benchmarks/benchmark_24.py | 9 +- csrc/ops.h | 7 + .../fp8_semi_structured/cusparseLt.cpp | 259 ++++++++++++++++-- csrc/torch_bindings.cpp | 14 + vllm/_custom_ops.py | 24 +- 6 files changed, 274 insertions(+), 48 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c224c1377769b..592fb6f4ea581 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -402,16 +402,13 @@ define_gpu_extension_target( target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) # If cuSparseLt is not installed we skip 2:4 optimizations -CHECK_INCLUDE_FILE_CXX("cusparseLt.h" HAVE_CUSPARSELT_H) - -# TODO has to be fixed. 
+CHECK_INCLUDE_FILE_CXX("cusparseLt.h" HAVE_CUSPARSELT) +message(STATUS "Result of include cusparseLt ${HAVE_CUSPARSELT}") target_compile_definitions(_C PRIVATE VLLM_CUSPARSELT_ENABLED=1) -# if(HAVE_CUSPARSELT_H) -# message(STATUS "cusparseLt found") +# if(HAVE_CUSPARSELT) # target_compile_definitions(_C PRIVATE VLLM_CUSPARSELT_ENABLED=1) # endif() - # # _moe_C extension # diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index aa3328b0f17cf..426fb653598a2 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -114,6 +114,9 @@ def print_timers(timers: Iterable[TMeasurement]): def run(MKNs: Iterable[Tuple[int, int, int]], use_fp8: bool) -> Iterable[TMeasurement]: results = [] + # MKNs = [(2048, 8192, 14336)] + # MKNs = [(32, 11008, 4096)] + MKNs = [(2048, 11008, 14336)] for m, k, n in MKNs: timers = bench(m, k, n, "gemm", f"MKN=({m}x{k}x{n})", use_fp8) print_timers(timers) @@ -130,9 +133,9 @@ def make_output(data: Iterable[TMeasurement], print_timers(data) # pickle all the results - timestamp = int(time.time()) if timestamp is None else timestamp - with open(f"{base_description}-{timestamp}.pkl", "wb") as f: - pkl.dump(data, f) + # timestamp = int(time.time()) if timestamp is None else timestamp + # with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + # pkl.dump(data, f) def run_model_bench(args): diff --git a/csrc/ops.h b/csrc/ops.h index 8943282953db3..690ea5b18939a 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -231,4 +231,11 @@ torch::Tensor cslt_mm_fp8_semi_structured( const c10::optional& alpha_opt, const c10::optional& bias_opt, bool transpose_result); +int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, + const torch::Tensor& dense_B); + +torch::Tensor cslt_mm_fp8_semi_structured_prepared(int64_t id); + +void cslt_fp8_semi_structured_destroy(int64_t id); + #endif diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index a5e8d0ca6c0bf..a79076567b3d2 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -4,10 +4,13 @@ #include #define STUB_FUNC_IMPL() \ + torch::Tensor cslt_compress_fp8_semi_structured( \ + const torch::Tensor& input) { \ torch::Tensor cslt_compress_fp8_semi_structured( \ const torch::Tensor& input) { \ TORCH_CHECK(false, \ - "Unsupported dtype for compressed matrix in current " \ + "cusparseLt is not found or " \ + "unsupported dtype for compressed matrix in current " \ "version of cuSPARSELt."); \ } \ \ @@ -17,7 +20,29 @@ TORCH_CHECK(false, \ "Unsupported dtype for compressed matrix multiplication in " \ "current version of cuSPARSELt."); \ - } + } \ + \ + int64_t cslt_prepare_mm_fp8_semi_structured( \ + const torch::Tensor& compressed_A, const torch::Tensor& dense_B) { \ + TORCH_CHECK(false, \ + "cusparseLt is not found or " \ + "unsupported dtype for compressed matrix in current " \ + "version of cuSPARSELt."); \ + } \ + \ + torch::Tensor cslt_mm_fp8_semi_structured_prepared(int64_t id) { \ + TORCH_CHECK(false, \ + "cusparseLt is not found or " \ + "unsupported dtype for compressed matrix in current " \ + "version of cuSPARSELt."); \ + } \ + \ + void cslt_fp8_semi_structured_destroy(int64_t id) { \ + TORCH_CHECK(false, \ + "cusparseLt is not found or " \ + "unsupported dtype for compressed matrix in current " \ + "version of cuSPARSELt."); \ + } \ #if 
defined(VLLM_CUSPARSELT_ENABLED) @@ -33,15 +58,49 @@ " when calling `" #EXPR "`"); \ } while (0) +namespace vllm { +namespace cusparseLt { + cusparseLtHandle_t handle; bool handle_initialized = false; +using cacheID = int64_t; + +struct cusparseLtEntry { + // cusparseLtEntry(): device() {} + int m; + int n; + int k; + + cusparseLtMatmulDescriptor_t matmul; + cusparseLtMatmulPlan_t plan; + cusparseLtMatmulAlgSelection_t alg_sel; + cusparseLtMatDescriptor_t sparse_input_descriptor; + cusparseLtMatDescriptor_t dense_input_descriptor; + cusparseLtMatDescriptor_t res_descriptor; + cusparseLtMatDescriptor_t C_descriptor; + + void* sparse_mat_ptr; + void* dense_mat_ptr; + + torch::Device device = torch::kCUDA; + torch::Dtype out_dtype; + + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator; + c10::DataPtr workspace_ptr; +}; + +std::map cusparseLt_cache; + +} // namespace cusparseLt +} // namespace vllm torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { TORCH_CHECK(input.scalar_type() == at::ScalarType::Float8_e4m3fn, - "Only float8 e4m3 is supported in vllm:cslt_compress") - if (!handle_initialized) { - TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); - handle_initialized = true; + "Only float8 e4m3 is supported in vllm:cslt_compress"); + namespace vc = vllm::cusparseLt; + if (!vc::handle_initialized) { + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vc::handle)); + vc::handle_initialized = true; } // create sparse descriptor, dtype auto compression_factor = 9; @@ -51,35 +110,181 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { input.new_empty(input.numel() * compression_factor / 16); TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &handle, &input_descriptor, input.size(0), input.size(1), input.size(1), + &vc::handle, &input_descriptor, input.size(0), input.size(1), input.size(1), 16, type, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); size_t compressed_size, compressed_buffer_size; TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompressedSize2( - &handle, &input_descriptor, &compressed_size, &compressed_buffer_size)); + &vc::handle, &input_descriptor, &compressed_size, &compressed_buffer_size)); auto& allocator = *c10::cuda::CUDACachingAllocator::get(); auto compressedBufferPtr = allocator.allocate(compressed_buffer_size); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompress2( - &handle, &input_descriptor, true, CUSPARSE_OPERATION_NON_TRANSPOSE, + &vc::handle, &input_descriptor, true, CUSPARSE_OPERATION_NON_TRANSPOSE, input.data_ptr(), compressed_tensor.data_ptr(), compressedBufferPtr.get(), stream)); return compressed_tensor; } +// vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const +// torch::Tensor& compressed_A, const torch::Tensor& dense_B) { +int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, + const torch::Tensor& dense_B) { + TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, + "Only float8 e4m3 is supported in vllm:cslt_compress"); + namespace vc = vllm::cusparseLt; + if (!vc::handle_initialized) { + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vllm::cusparseLt::handle)); + vc::handle_initialized = true; + } + vc::cacheID id; + if (vc::cusparseLt_cache.empty()) { + id = 0; + } else { + id = vc::cusparseLt_cache.rbegin()->first + 1; + } + vc::cusparseLtEntry& entry = vc::cusparseLt_cache[id]; + + float alpha = 1.0; + float beta = 0.0; + cudaDataType input_type = CUDA_R_8F_E4M3; + cudaDataType output_type; + 
cudaDataType C_type; + cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; + auto compression_factor = 9; + auto out_dtype = dense_B.scalar_type(); + + int64_t k = dense_B.size(0); + int64_t n = dense_B.size(1); + int64_t m = (compressed_A.numel() * 16 / compression_factor) / k; + + switch (out_dtype) { + case at::ScalarType::Float8_e4m3fn: + output_type = CUDA_R_8F_E4M3; + C_type = CUDA_R_16F; + break; + case at::ScalarType::Half: + output_type = CUDA_R_16F; + C_type = CUDA_R_16F; + break; + case at::ScalarType::BFloat16: + output_type = CUDA_R_16BF; + C_type = CUDA_R_16BF; + break; + case at::ScalarType::Float: + output_type = CUDA_R_32F; + C_type = CUDA_R_32F; + break; + default: + TORCH_CHECK(false, + "Unsupported out_dtype passed, must be one of {fp16, bf16, " + "float32} for fp8 inputs"); + break; + } + + // initialize sparse descriptor + TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( + &vc::handle, &entry.sparse_input_descriptor, m, k, k, 16, input_type, + CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); + + // initialize dense descriptor + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &vc::handle, &entry.dense_input_descriptor, + (dense_B.is_contiguous()) ? k : n, (dense_B.is_contiguous()) ? n : k, + (dense_B.is_contiguous()) ? n : k, 16, input_type, CUSPARSE_ORDER_ROW)); + + // initialize result descriptor + TORCH_CUDASPARSE_CHECK( + cusparseLtDenseDescriptorInit(&vc::handle, &entry.res_descriptor, m, n, m, + 16, output_type, CUSPARSE_ORDER_ROW)); + + TORCH_CUDASPARSE_CHECK( + cusparseLtDenseDescriptorInit(&vc::handle, &entry.C_descriptor, m, n, n, + 16, C_type, CUSPARSE_ORDER_ROW)); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( + &vc::handle, &entry.matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, + (dense_B.is_contiguous()) ? 
CUSPARSE_OPERATION_NON_TRANSPOSE + : CUSPARSE_OPERATION_TRANSPOSE, + &entry.sparse_input_descriptor, &entry.dense_input_descriptor, + &entry.C_descriptor, &entry.res_descriptor, compute_type)); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit( + &vc::handle, &entry.alg_sel, &entry.matmul, + CUSPARSELT_MATMUL_ALG_DEFAULT)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanInit( + &vc::handle, &entry.plan, &entry.matmul, &entry.alg_sel)); + + size_t workspace_size; + TORCH_CUDASPARSE_CHECK( + cusparseLtMatmulGetWorkspace(&vc::handle, &entry.plan, &workspace_size)); + + entry.allocator = c10::cuda::CUDACachingAllocator::get(); + entry.workspace_ptr = entry.allocator->allocate(workspace_size); + entry.device = dense_B.device(); + entry.out_dtype = out_dtype; + entry.m = m; + entry.n = n; + entry.k = k; + return id; +} + +torch::Tensor cslt_mm_fp8_semi_structured_prepared( + vllm::cusparseLt::cacheID id) { + namespace vc = vllm::cusparseLt; + TORCH_CHECK(vc::handle_initialized, + "Call of matmul with unintialized matmul"); + if (vc::cusparseLt_cache.count(id) == 0) { + TORCH_CHECK(false, "cusparse matmul Id is not found"); + } + const auto& entry = vc::cusparseLt_cache[id]; + + auto res_tensor_options = + c10::TensorOptions().dtype(entry.out_dtype).device(entry.device); + at::Tensor res = at::empty({entry.m, entry.n}, res_tensor_options); + float alpha = 1.0; + float beta = 0.0; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + TORCH_CUDASPARSE_CHECK( + cusparseLtMatmul(&vc::handle, &entry.plan, &alpha, entry.sparse_mat_ptr, + entry.dense_mat_ptr, &beta, res.data_ptr(), + res.data_ptr(), entry.workspace_ptr.get(), &stream, 1)); + + return res; +} + +void cslt_fp8_semi_structured_destroy(vllm::cusparseLt::cacheID id) { + TORCH_CHECK(vllm::cusparseLt::handle_initialized, + "Call of destroy cusparseId with unintialized cusparseLt"); + if (vllm::cusparseLt::cusparseLt_cache.count(id) == 0) { + TORCH_CHECK(false, "cusparse matmul Id is not found"); + } + auto& entry = vllm::cusparseLt::cusparseLt_cache[id]; + + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(&entry.sparse_input_descriptor)); + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(&entry.dense_input_descriptor)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&entry.res_descriptor)); + // Destroy plan + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&entry.plan)); +} + torch::Tensor cslt_mm_fp8_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, const c10::optional& alpha_opt, const c10::optional& bias_opt, bool transpose_result) { TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); - - if (!handle_initialized) { - TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle)); - handle_initialized = true; + namespace vc = vllm::cusparseLt; + if (!vc::handle_initialized) { + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vc::handle)); + vc::handle_initialized = true; } + // cusparseLt data structures cusparseLtMatmulDescriptor_t matmul; cusparseLtMatmulPlan_t plan; @@ -126,13 +331,13 @@ torch::Tensor cslt_mm_fp8_semi_structured( // initialize sparse descriptor cusparseLtMatDescriptor_t sparse_input_descriptor; TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &handle, &sparse_input_descriptor, m, k, k, 16, input_type, + &vc::handle, &sparse_input_descriptor, m, k, k, 16, input_type, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); // initialize dense input descriptor cusparseLtMatDescriptor_t 
dense_input_descriptor; TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &handle, &dense_input_descriptor, (dense_B.is_contiguous()) ? k : n, + &vc::handle, &dense_input_descriptor, (dense_B.is_contiguous()) ? k : n, (dense_B.is_contiguous()) ? n : k, (dense_B.is_contiguous()) ? n : k, 16, input_type, CUSPARSE_ORDER_ROW)); @@ -144,17 +349,17 @@ torch::Tensor cslt_mm_fp8_semi_structured( cusparseLtMatDescriptor_t res_descriptor; TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &handle, &res_descriptor, m, n, (transpose_result) ? m : n, 16, + &vc::handle, &res_descriptor, m, n, (transpose_result) ? m : n, 16, output_type, (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); cusparseLtMatDescriptor_t C_descriptor; TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &handle, &C_descriptor, m, n, (transpose_result) ? m : n, 16, C_type, + &vc::handle, &C_descriptor, m, n, (transpose_result) ? m : n, 16, C_type, (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( - &handle, &matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, + &vc::handle, &matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, (dense_B.is_contiguous()) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE, &sparse_input_descriptor, &dense_input_descriptor, &C_descriptor, @@ -165,7 +370,7 @@ torch::Tensor cslt_mm_fp8_semi_structured( auto& bias = bias_opt.value(); void* dBias = bias.data_ptr(); TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( - &handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, + &vc::handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, sizeof(dBias))); } @@ -184,27 +389,29 @@ torch::Tensor cslt_mm_fp8_semi_structured( } } - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, - CUSPARSELT_MATMUL_ALG_DEFAULT); + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit( + &vc::handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)); + TORCH_CUDASPARSE_CHECK( + cusparseLtMatmulPlanInit(&vc::handle, &plan, &matmul, &alg_sel)); - cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel); size_t workspace_size; TORCH_CUDASPARSE_CHECK( - cusparseLtMatmulGetWorkspace(&handle, &plan, &workspace_size)); + cusparseLtMatmulGetWorkspace(&vc::handle, &plan, &workspace_size)); auto& allocator = *c10::cuda::CUDACachingAllocator::get(); - auto workspacePtr = allocator.allocate(workspace_size); + auto workspace_ptr = allocator.allocate(workspace_size); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( - &handle, &plan, alpha_ptr, compressed_A.data_ptr(), dense_B.data_ptr(), - &beta, res.data_ptr(), res.data_ptr(), workspacePtr.get(), &stream, 1)); + &vc::handle, &plan, &alpha, compressed_A.data_ptr(), dense_B.data_ptr(), + &beta, res.data_ptr(), res.data_ptr(), workspace_ptr.get(), &stream, 1)); // Destroy descriptors TORCH_CUDASPARSE_CHECK( cusparseLtMatDescriptorDestroy(&sparse_input_descriptor)); TORCH_CUDASPARSE_CHECK( cusparseLtMatDescriptorDestroy(&dense_input_descriptor)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&C_descriptor)); TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&res_descriptor)); // Destroy plan TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 9c2520c7fcd23..33ef571363937 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -333,6 +333,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("cslt_mm_fp8_semi_structured", 
torch::kCUDA, &cslt_mm_fp8_semi_structured); + + ops.def( + "cslt_prepare_mm_fp8_semi_structured(Tensor! compressed_A, Tensor! " + "denseB) -> int"); + ops.impl("cslt_prepare_mm_fp8_semi_structured", torch::kCUDA, + &cslt_prepare_mm_fp8_semi_structured); + + ops.def("cslt_mm_fp8_semi_structured_prepared(int cacheId) -> Tensor"); + ops.impl("cslt_mm_fp8_semi_structured_prepared", torch::kCUDA, + &cslt_mm_fp8_semi_structured_prepared); + + ops.def("cslt_fp8_semi_structured_destroy(int cacheId) -> ()"); + ops.impl("cslt_fp8_semi_structured_destroy", torch::kCUDA, + &cslt_fp8_semi_structured_destroy); #endif // Quantized GEMM for GPTQ. diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 2f46003a5d582..34c9525cb7401 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -722,21 +722,19 @@ def semi_structured_fp8_mm(A_compressed: torch.Tensor, bias, transpose_result) -# semi structured fp8 -def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: - assert input.dtype == torch.float8_e4m3fn - return torch.ops._C.cslt_compress_fp8_semi_structured(input) +def semi_structured_fp8_prepare_mm(A_compressed: torch.Tensor, + B_dense: torch.Tensor) -> int: + assert A_compressed.dtype == torch.float8_e4m3fn + return torch.ops._C.cslt_prepare_mm_fp8_semi_structured( + A_compressed, B_dense) -def semi_structured_fp8_mm(A_compressed: torch.Tensor, - B_dense: torch.Tensor, - alpha: Optional[torch.Tensor] = None, - bias: Optional[torch.Tensor] = None, - transpose_result: bool = False) -> torch.Tensor: - assert A_compressed.dtype == torch.float8_e4m3fn - return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, - alpha, bias, - transpose_result) +def semi_structured_fp8_mm_prepared(cacheId: int) -> torch.Tensor: + return torch.ops.cslt_mm_fp8_semi_structured_prepared(cacheId) + + +def semi_structured_fp8_destroy(cacheId: int): + torch.ops.cslt_fp8_semi_structured_destroy(cacheId) # int8 From b1aaea5c48d8f579fabcd79784829a3f8c00abd8 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Fri, 25 Oct 2024 14:35:31 +0000 Subject: [PATCH 30/39] Cached cusparseLt --- .../cusparseLt_benchmarks/benchmark_24.py | 29 +++-- csrc/ops.h | 5 +- .../fp8_semi_structured/cusparseLt.cpp | 104 +++++++++++------- csrc/torch_bindings.cpp | 4 +- tests/test_cusparseLt.cpp | 12 ++ vllm/_custom_ops.py | 10 +- 6 files changed, 107 insertions(+), 57 deletions(-) create mode 100644 tests/test_cusparseLt.cpp diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index 426fb653598a2..4599964421bc0 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( compress_to_torch_sparse_semi_structured_mat, dense_matmul, get_random_mat, is_semi_structured_supported, semi_structured_sparse_dense_gemm) +from vllm._custom_ops import (semi_structured_fp8_prepare_mm, semi_structured_fp8_mm_prepared) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -79,15 +80,15 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, a, b = make_rand_tensors(torch.int8, m, n, k) # cutlass i8 - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_matmul-w-scales", - dense_matmul, a, b, torch.int8)) + # timers.append( + # bench_fn(label, sub_label, "cutlass_i8_i8_matmul-w-scales", + # dense_matmul, a, b, torch.int8)) # cusparseLt i8 - timers.append( - bench_fn(label, 
sub_label, "cusparseLt_i8_i8_2_4", - semi_structured_sparse_dense_gemm, - compress_to_torch_sparse_semi_structured_mat(a), b)) + # timers.append( + # bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4", + # semi_structured_sparse_dense_gemm, + # compress_to_torch_sparse_semi_structured_mat(a), b)) if use_fp8: a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) @@ -101,6 +102,13 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", semi_structured_sparse_dense_gemm, compress_to_torch_sparse_semi_structured_mat(a), b)) + + a_compressed = compress_to_torch_sparse_semi_structured_mat(a) + handle = semi_structured_fp8_prepare_mm(a_compressed.packed, b) + timers.append( + bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_prepared", + semi_structured_fp8_mm_prepared, + torch.tensor([handle], dtype=torch.int64, device='cuda'))) return timers @@ -114,9 +122,6 @@ def print_timers(timers: Iterable[TMeasurement]): def run(MKNs: Iterable[Tuple[int, int, int]], use_fp8: bool) -> Iterable[TMeasurement]: results = [] - # MKNs = [(2048, 8192, 14336)] - # MKNs = [(32, 11008, 4096)] - MKNs = [(2048, 11008, 14336)] for m, k, n in MKNs: timers = bench(m, k, n, "gemm", f"MKN=({m}x{k}x{n})", use_fp8) print_timers(timers) @@ -181,8 +186,8 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: for d in model_bench_data: all_data.extend(d) # pickle all data - with open(f"model_bench-{timestamp}.pkl", "wb") as f: - pkl.dump(all_data, f) + # with open(f"model_bench-{timestamp}.pkl", "wb") as f: + # pkl.dump(all_data, f) if __name__ == '__main__': diff --git a/csrc/ops.h b/csrc/ops.h index 690ea5b18939a..655cd0d9d555b 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -232,9 +232,10 @@ torch::Tensor cslt_mm_fp8_semi_structured( const c10::optional& bias_opt, bool transpose_result); int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, - const torch::Tensor& dense_B); + const torch::Tensor& dense_B, + const c10::optional& bias_opt, bool transpose_result); -torch::Tensor cslt_mm_fp8_semi_structured_prepared(int64_t id); +torch::Tensor cslt_mm_fp8_semi_structured_prepared(const torch::Tensor& id); void cslt_fp8_semi_structured_destroy(int64_t id); diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index a79076567b3d2..42975d5ff0eea 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -58,27 +58,44 @@ " when calling `" #EXPR "`"); \ } while (0) + + namespace vllm { namespace cusparseLt { -cusparseLtHandle_t handle; -bool handle_initialized = false; -using cacheID = int64_t; - struct cusparseLtEntry { - // cusparseLtEntry(): device() {} - int m; - int n; - int k; + // cusparseLtEntry() {} + // void operator=(const cusparseLtEntry& entry) { + // sparse_input_descriptor = entry.sparse_input_descriptor; + // dense_input_descriptor = entry.dense_input_descriptor; + // res_descriptor = entry.res_descriptor; + // C_descriptor = entry.C_descriptor; + // matmul = entry.matmul; + // plan = entry.plan; + + // sparse_mat_ptr = entry.sparse_mat_ptr; + // dense_mat_ptr = entry.dense_mat_ptr; + + // device = std::move(entry.device); + // allocator = entry.allocator; + // out_dtype = std::move(entry.out_dtype); + + // workspace_ptr = std::move(entry.workspace_ptr); + + // m = entry.m; + // n = entry.n; + // k = entry.k; + // } - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulPlan_t 
plan; - cusparseLtMatmulAlgSelection_t alg_sel; cusparseLtMatDescriptor_t sparse_input_descriptor; cusparseLtMatDescriptor_t dense_input_descriptor; cusparseLtMatDescriptor_t res_descriptor; cusparseLtMatDescriptor_t C_descriptor; + cusparseLtMatmulDescriptor_t matmul; + cusparseLtMatmulPlan_t plan; + + void* sparse_mat_ptr; void* dense_mat_ptr; @@ -87,13 +104,23 @@ struct cusparseLtEntry { c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator; c10::DataPtr workspace_ptr; + + int m; + int n; + int k; }; -std::map cusparseLt_cache; +cusparseLtHandle_t handle; +bool handle_initialized = false; +using cacheID = int64_t; + +std::map cusparseLt_cache; } // namespace cusparseLt } // namespace vllm +vllm::cusparseLt::cusparseLtEntry entry; + torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { TORCH_CHECK(input.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); @@ -128,15 +155,14 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { return compressed_tensor; } -// vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const -// torch::Tensor& compressed_A, const torch::Tensor& dense_B) { -int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, - const torch::Tensor& dense_B) { +vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, + const torch::Tensor& dense_B, + const c10::optional& bias_opt, bool transpose_result) { TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); namespace vc = vllm::cusparseLt; if (!vc::handle_initialized) { - TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vllm::cusparseLt::handle)); + TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vc::handle)); vc::handle_initialized = true; } vc::cacheID id; @@ -145,7 +171,9 @@ int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, } else { id = vc::cusparseLt_cache.rbegin()->first + 1; } - vc::cusparseLtEntry& entry = vc::cusparseLt_cache[id]; + + // vc::cusparseLtEntry& entry = vc::cusparseLt_cache[id]; + // vc::cusparseLtEntry entry; float alpha = 1.0; float beta = 0.0; @@ -155,7 +183,6 @@ int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; auto compression_factor = 9; auto out_dtype = dense_B.scalar_type(); - int64_t k = dense_B.size(0); int64_t n = dense_B.size(1); int64_t m = (compressed_A.numel() * 16 / compression_factor) / k; @@ -183,10 +210,9 @@ int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, "float32} for fp8 inputs"); break; } - // initialize sparse descriptor TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &vc::handle, &entry.sparse_input_descriptor, m, k, k, 16, input_type, + &vc::handle, &(entry.sparse_input_descriptor), m, k, k, 16, input_type, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); // initialize dense descriptor @@ -196,13 +222,15 @@ int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, (dense_B.is_contiguous()) ? n : k, 16, input_type, CUSPARSE_ORDER_ROW)); // initialize result descriptor - TORCH_CUDASPARSE_CHECK( - cusparseLtDenseDescriptorInit(&vc::handle, &entry.res_descriptor, m, n, m, - 16, output_type, CUSPARSE_ORDER_ROW)); + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &vc::handle, &entry.res_descriptor, m, n, (transpose_result) ? 
m : n, 16, + output_type, + (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &vc::handle, &entry.C_descriptor, m, n, (transpose_result) ? m : n, 16, C_type, + (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); - TORCH_CUDASPARSE_CHECK( - cusparseLtDenseDescriptorInit(&vc::handle, &entry.C_descriptor, m, n, n, - 16, C_type, CUSPARSE_ORDER_ROW)); + cusparseLtMatmulAlgSelection_t alg_sel; TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( &vc::handle, &entry.matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -210,13 +238,11 @@ int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, : CUSPARSE_OPERATION_TRANSPOSE, &entry.sparse_input_descriptor, &entry.dense_input_descriptor, &entry.C_descriptor, &entry.res_descriptor, compute_type)); - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit( - &vc::handle, &entry.alg_sel, &entry.matmul, + &vc::handle, &alg_sel, &entry.matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)); TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanInit( - &vc::handle, &entry.plan, &entry.matmul, &entry.alg_sel)); - + &vc::handle, &entry.plan, &entry.matmul, &alg_sel)); size_t workspace_size; TORCH_CUDASPARSE_CHECK( cusparseLtMatmulGetWorkspace(&vc::handle, &entry.plan, &workspace_size)); @@ -228,18 +254,22 @@ int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, entry.m = m; entry.n = n; entry.k = k; + entry.sparse_mat_ptr = compressed_A.data_ptr(); + entry.dense_mat_ptr = dense_B.data_ptr(); return id; } torch::Tensor cslt_mm_fp8_semi_structured_prepared( - vllm::cusparseLt::cacheID id) { + const torch::Tensor& id_tensor) { namespace vc = vllm::cusparseLt; TORCH_CHECK(vc::handle_initialized, "Call of matmul with unintialized matmul"); - if (vc::cusparseLt_cache.count(id) == 0) { - TORCH_CHECK(false, "cusparse matmul Id is not found"); - } - const auto& entry = vc::cusparseLt_cache[id]; + // TORCH_CHECK(id_tensor.numel() == 1, "ID has to be single valued"); + // auto id = id_tensor.item(); + // if (vc::cusparseLt_cache.count(id) == 0) { + // TORCH_CHECK(false, "cusparse matmul Id is not found"); + // } + // const auto& entry = vc::cusparseLt_cache[id]; auto res_tensor_options = c10::TensorOptions().dtype(entry.out_dtype).device(entry.device); @@ -262,7 +292,7 @@ void cslt_fp8_semi_structured_destroy(vllm::cusparseLt::cacheID id) { if (vllm::cusparseLt::cusparseLt_cache.count(id) == 0) { TORCH_CHECK(false, "cusparse matmul Id is not found"); } - auto& entry = vllm::cusparseLt::cusparseLt_cache[id]; + // auto& entry = vllm::cusparseLt::cusparseLt_cache[id]; TORCH_CUDASPARSE_CHECK( cusparseLtMatDescriptorDestroy(&entry.sparse_input_descriptor)); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 33ef571363937..755338fb6f559 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -336,11 +336,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cslt_prepare_mm_fp8_semi_structured(Tensor! compressed_A, Tensor! " - "denseB) -> int"); + "denseB, Tensor!? 
bias, bool transpose_result) -> int"); ops.impl("cslt_prepare_mm_fp8_semi_structured", torch::kCUDA, &cslt_prepare_mm_fp8_semi_structured); - ops.def("cslt_mm_fp8_semi_structured_prepared(int cacheId) -> Tensor"); + ops.def("cslt_mm_fp8_semi_structured_prepared(Tensor cacheId) -> Tensor"); ops.impl("cslt_mm_fp8_semi_structured_prepared", torch::kCUDA, &cslt_mm_fp8_semi_structured_prepared); diff --git a/tests/test_cusparseLt.cpp b/tests/test_cusparseLt.cpp new file mode 100644 index 0000000000000..9c8d3cb813ef1 --- /dev/null +++ b/tests/test_cusparseLt.cpp @@ -0,0 +1,12 @@ + #include + +cusparseLtHandle_t handle; + + +struct Entry { + cusparseLtMatDescriptor_t sparse_input_descriptor; +}; + +int main() { + +} \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 34c9525cb7401..387403ff4d889 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -723,18 +723,20 @@ def semi_structured_fp8_mm(A_compressed: torch.Tensor, def semi_structured_fp8_prepare_mm(A_compressed: torch.Tensor, - B_dense: torch.Tensor) -> int: + B_dense: torch.Tensor, + bias: Optional[torch.Tensor] = None, + transpose_result: bool = False) -> int: assert A_compressed.dtype == torch.float8_e4m3fn return torch.ops._C.cslt_prepare_mm_fp8_semi_structured( - A_compressed, B_dense) + A_compressed, B_dense, bias, transpose_result) def semi_structured_fp8_mm_prepared(cacheId: int) -> torch.Tensor: - return torch.ops.cslt_mm_fp8_semi_structured_prepared(cacheId) + return torch.ops._C.cslt_mm_fp8_semi_structured_prepared(cacheId) def semi_structured_fp8_destroy(cacheId: int): - torch.ops.cslt_fp8_semi_structured_destroy(cacheId) + torch.ops._C.cslt_fp8_semi_structured_destroy(cacheId) # int8 From c36401c1f92c34d3b5aadf464c4128ebaa490a14 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Fri, 25 Oct 2024 15:08:11 +0000 Subject: [PATCH 31/39] Fix destroy function --- benchmarks/cusparseLt_benchmarks/benchmark_24.py | 11 ++++++++--- csrc/ops.h | 2 +- csrc/quantization/fp8_semi_structured/cusparseLt.cpp | 10 ++++++---- csrc/torch_bindings.cpp | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index 4599964421bc0..101a9bc20be6e 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( compress_to_torch_sparse_semi_structured_mat, dense_matmul, get_random_mat, is_semi_structured_supported, semi_structured_sparse_dense_gemm) -from vllm._custom_ops import (semi_structured_fp8_prepare_mm, semi_structured_fp8_mm_prepared) +from vllm._custom_ops import (semi_structured_fp8_prepare_mm, semi_structured_fp8_mm_prepared, semi_structured_fp8_destroy) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -105,11 +105,12 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, a_compressed = compress_to_torch_sparse_semi_structured_mat(a) handle = semi_structured_fp8_prepare_mm(a_compressed.packed, b) + id = torch.tensor([handle], dtype=torch.int64, device='cuda') timers.append( bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_prepared", semi_structured_fp8_mm_prepared, - torch.tensor([handle], dtype=torch.int64, device='cuda'))) - + id)) + semi_structured_fp8_destroy(id) return timers @@ -122,6 +123,10 @@ def print_timers(timers: Iterable[TMeasurement]): def run(MKNs: Iterable[Tuple[int, int, 
int]], use_fp8: bool) -> Iterable[TMeasurement]: results = [] + # MKNs = [(2048, 8192, 14336)] + MKNs = [(32, 11008, 4096)] + # MKNs = [(2048, 11008, 14336)] + for m, k, n in MKNs: timers = bench(m, k, n, "gemm", f"MKN=({m}x{k}x{n})", use_fp8) print_timers(timers) diff --git a/csrc/ops.h b/csrc/ops.h index 655cd0d9d555b..6fe73a712a947 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -237,6 +237,6 @@ int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, torch::Tensor cslt_mm_fp8_semi_structured_prepared(const torch::Tensor& id); -void cslt_fp8_semi_structured_destroy(int64_t id); +void cslt_fp8_semi_structured_destroy(const torch::Tensor& id_tensor); #endif diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index 42975d5ff0eea..80462925fb993 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -286,12 +286,14 @@ torch::Tensor cslt_mm_fp8_semi_structured_prepared( return res; } -void cslt_fp8_semi_structured_destroy(vllm::cusparseLt::cacheID id) { +void cslt_fp8_semi_structured_destroy(const torch::Tensor& id_tensor) { TORCH_CHECK(vllm::cusparseLt::handle_initialized, "Call of destroy cusparseId with unintialized cusparseLt"); - if (vllm::cusparseLt::cusparseLt_cache.count(id) == 0) { - TORCH_CHECK(false, "cusparse matmul Id is not found"); - } + // TORCH_CHECK(id_tensor.numel() == 1, "ID has to be single valued"); + // auto id = id_tensor.item(); + // if (vllm::cusparseLt::cusparseLt_cache.count(id) == 0) { + // TORCH_CHECK(false, "cusparse matmul Id is not found"); + // } // auto& entry = vllm::cusparseLt::cusparseLt_cache[id]; TORCH_CUDASPARSE_CHECK( diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 755338fb6f559..f5041430cff85 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -344,7 +344,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("cslt_mm_fp8_semi_structured_prepared", torch::kCUDA, &cslt_mm_fp8_semi_structured_prepared); - ops.def("cslt_fp8_semi_structured_destroy(int cacheId) -> ()"); + ops.def("cslt_fp8_semi_structured_destroy(Tensor cacheId) -> ()"); ops.impl("cslt_fp8_semi_structured_destroy", torch::kCUDA, &cslt_fp8_semi_structured_destroy); #endif From 9f6a46930749d4c6696f6e8d92f603e64cb1e7ac Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Fri, 25 Oct 2024 15:27:51 +0000 Subject: [PATCH 32/39] Prepare for reproduce --- .../cusparseLt_benchmarks/benchmark_24.py | 14 ++++----- .../fp8_semi_structured/cusparseLt.cpp | 31 ++++++++++--------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index 101a9bc20be6e..cc9a9e1c2603c 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -93,15 +93,15 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, if use_fp8: a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) # cutlass fp8 - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul-w-scales", - dense_matmul, a, b, torch.float8_e4m3fn)) + # timers.append( + # bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul-w-scales", + # dense_matmul, a, b, torch.float8_e4m3fn)) # cusparseLt fp8 - timers.append( - bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", - semi_structured_sparse_dense_gemm, - compress_to_torch_sparse_semi_structured_mat(a), b)) + # timers.append( + # 
bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", + # semi_structured_sparse_dense_gemm, + # compress_to_torch_sparse_semi_structured_mat(a), b)) a_compressed = compress_to_torch_sparse_semi_structured_mat(a) handle = semi_structured_fp8_prepare_mm(a_compressed.packed, b) diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index 80462925fb993..be242283648d1 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -119,7 +119,7 @@ std::map cusparseLt_cache; } // namespace cusparseLt } // namespace vllm -vllm::cusparseLt::cusparseLtEntry entry; +// vllm::cusparseLt::cusparseLtEntry entry; torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { TORCH_CHECK(input.scalar_type() == at::ScalarType::Float8_e4m3fn, @@ -172,7 +172,7 @@ vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const torch::Tenso id = vc::cusparseLt_cache.rbegin()->first + 1; } - // vc::cusparseLtEntry& entry = vc::cusparseLt_cache[id]; + vc::cusparseLtEntry& entry = vc::cusparseLt_cache[id]; // vc::cusparseLtEntry entry; float alpha = 1.0; @@ -264,12 +264,12 @@ torch::Tensor cslt_mm_fp8_semi_structured_prepared( namespace vc = vllm::cusparseLt; TORCH_CHECK(vc::handle_initialized, "Call of matmul with unintialized matmul"); - // TORCH_CHECK(id_tensor.numel() == 1, "ID has to be single valued"); - // auto id = id_tensor.item(); - // if (vc::cusparseLt_cache.count(id) == 0) { - // TORCH_CHECK(false, "cusparse matmul Id is not found"); - // } - // const auto& entry = vc::cusparseLt_cache[id]; + TORCH_CHECK(id_tensor.numel() == 1, "ID has to be single valued"); + auto id = id_tensor.item(); + if (vc::cusparseLt_cache.count(id) == 0) { + TORCH_CHECK(false, "cusparse matmul Id is not found"); + } + const auto& entry = vc::cusparseLt_cache[id]; auto res_tensor_options = c10::TensorOptions().dtype(entry.out_dtype).device(entry.device); @@ -287,14 +287,15 @@ torch::Tensor cslt_mm_fp8_semi_structured_prepared( } void cslt_fp8_semi_structured_destroy(const torch::Tensor& id_tensor) { - TORCH_CHECK(vllm::cusparseLt::handle_initialized, + namespace vc = vllm::cusparseLt; + TORCH_CHECK(vc::handle_initialized, "Call of destroy cusparseId with unintialized cusparseLt"); - // TORCH_CHECK(id_tensor.numel() == 1, "ID has to be single valued"); - // auto id = id_tensor.item(); - // if (vllm::cusparseLt::cusparseLt_cache.count(id) == 0) { - // TORCH_CHECK(false, "cusparse matmul Id is not found"); - // } - // auto& entry = vllm::cusparseLt::cusparseLt_cache[id]; + TORCH_CHECK(id_tensor.numel() == 1, "ID has to be single valued"); + auto id = id_tensor.item(); + if (vc::cusparseLt_cache.count(id) == 0) { + TORCH_CHECK(false, "cusparse matmul Id is not found"); + } + auto& entry = vc::cusparseLt_cache[id]; TORCH_CUDASPARSE_CHECK( cusparseLtMatDescriptorDestroy(&entry.sparse_input_descriptor)); From 2e56de9f50e43f8f3d0917abc226e46bf841de42 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 30 Oct 2024 15:47:29 +0000 Subject: [PATCH 33/39] Fix cusparseLt caching --- .../cusparseLt_benchmarks/benchmark_24.py | 19 ++-- csrc/ops.h | 10 +- .../fp8_semi_structured/cusparseLt.cpp | 95 ++++++++----------- csrc/torch_bindings.cpp | 15 ++- tests/kernels/test_semi_structured.py | 19 ++++ tests/test_cusparseLt.cpp | 12 --- vllm/_custom_ops.py | 10 +- 7 files changed, 84 insertions(+), 96 deletions(-) delete mode 100644 tests/test_cusparseLt.cpp diff --git 
a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index cc9a9e1c2603c..17d42d386dd6c 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -93,19 +93,20 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, if use_fp8: a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) # cutlass fp8 - # timers.append( - # bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul-w-scales", - # dense_matmul, a, b, torch.float8_e4m3fn)) + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul-w-scales", + dense_matmul, a, b, torch.float8_e4m3fn)) # cusparseLt fp8 - # timers.append( - # bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", - # semi_structured_sparse_dense_gemm, - # compress_to_torch_sparse_semi_structured_mat(a), b)) + timers.append( + bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", + semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a), b)) a_compressed = compress_to_torch_sparse_semi_structured_mat(a) handle = semi_structured_fp8_prepare_mm(a_compressed.packed, b) - id = torch.tensor([handle], dtype=torch.int64, device='cuda') + # id = torch.tensor([handle], dtype=torch.int64, device='cuda') + id = int(handle) timers.append( bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_prepared", semi_structured_fp8_mm_prepared, @@ -124,7 +125,7 @@ def run(MKNs: Iterable[Tuple[int, int, int]], use_fp8: bool) -> Iterable[TMeasurement]: results = [] # MKNs = [(2048, 8192, 14336)] - MKNs = [(32, 11008, 4096)] + # MKNs = [(32, 11008, 4096)] # MKNs = [(2048, 11008, 14336)] for m, k, n in MKNs: diff --git a/csrc/ops.h b/csrc/ops.h index 6fe73a712a947..8e88bc24f74c8 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -229,14 +229,16 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input); torch::Tensor cslt_mm_fp8_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, const c10::optional& alpha_opt, - const c10::optional& bias_opt, bool transpose_result); + const c10::optional& bias_opt); int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, const torch::Tensor& dense_B, - const c10::optional& bias_opt, bool transpose_result); + const c10::optional& bias_opt); -torch::Tensor cslt_mm_fp8_semi_structured_prepared(const torch::Tensor& id); +// torch::Tensor cslt_mm_fp8_semi_structured_prepared(const torch::Tensor& id); +torch::Tensor cslt_mm_fp8_semi_structured_prepared(int64_t id); -void cslt_fp8_semi_structured_destroy(const torch::Tensor& id_tensor); +// void cslt_fp8_semi_structured_destroy(const torch::Tensor& id_tensor); +void cslt_fp8_semi_structured_destroy(int64_t id); #endif diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index be242283648d1..36106c25ee179 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -16,7 +16,7 @@ \ torch::Tensor cslt_mm_fp8_semi_structured( \ const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ - const c10::optional& bias_opt, bool transpose_result) { \ + const c10::optional& bias_opt) { \ TORCH_CHECK(false, \ "Unsupported dtype for compressed matrix multiplication in " \ "current version of cuSPARSELt."); \ @@ -63,30 +63,8 @@ namespace vllm { namespace cusparseLt { -struct cusparseLtEntry { - // cusparseLtEntry() {} - // void operator=(const cusparseLtEntry& entry) { - // 
sparse_input_descriptor = entry.sparse_input_descriptor; - // dense_input_descriptor = entry.dense_input_descriptor; - // res_descriptor = entry.res_descriptor; - // C_descriptor = entry.C_descriptor; - // matmul = entry.matmul; - // plan = entry.plan; - - // sparse_mat_ptr = entry.sparse_mat_ptr; - // dense_mat_ptr = entry.dense_mat_ptr; - - // device = std::move(entry.device); - // allocator = entry.allocator; - // out_dtype = std::move(entry.out_dtype); - - // workspace_ptr = std::move(entry.workspace_ptr); - - // m = entry.m; - // n = entry.n; - // k = entry.k; - // } +struct cusparseLtEntry { cusparseLtMatDescriptor_t sparse_input_descriptor; cusparseLtMatDescriptor_t dense_input_descriptor; cusparseLtMatDescriptor_t res_descriptor; @@ -101,9 +79,7 @@ struct cusparseLtEntry { torch::Device device = torch::kCUDA; torch::Dtype out_dtype; - - c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator; - c10::DataPtr workspace_ptr; + void* workspace_ptr; int m; int n; @@ -157,7 +133,7 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, const torch::Tensor& dense_B, - const c10::optional& bias_opt, bool transpose_result) { + const c10::optional& bias_opt) { TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); namespace vc = vllm::cusparseLt; @@ -173,7 +149,6 @@ vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const torch::Tenso } vc::cusparseLtEntry& entry = vc::cusparseLt_cache[id]; - // vc::cusparseLtEntry entry; float alpha = 1.0; float beta = 0.0; @@ -187,6 +162,11 @@ vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const torch::Tenso int64_t n = dense_B.size(1); int64_t m = (compressed_A.numel() * 16 / compression_factor) / k; + cusparseLtMatDescriptor_t sparse_input_descriptor; + cusparseLtMatDescriptor_t dense_input_descriptor; + cusparseLtMatDescriptor_t res_descriptor; + cusparseLtMatDescriptor_t C_descriptor; + switch (out_dtype) { case at::ScalarType::Float8_e4m3fn: output_type = CUDA_R_8F_E4M3; @@ -212,43 +192,43 @@ vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const torch::Tenso } // initialize sparse descriptor TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &vc::handle, &(entry.sparse_input_descriptor), m, k, k, 16, input_type, + &vc::handle, &sparse_input_descriptor, m, k, k, 16, input_type, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); // initialize dense descriptor TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &entry.dense_input_descriptor, + &vc::handle, &dense_input_descriptor, (dense_B.is_contiguous()) ? k : n, (dense_B.is_contiguous()) ? n : k, (dense_B.is_contiguous()) ? n : k, 16, input_type, CUSPARSE_ORDER_ROW)); // initialize result descriptor - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &entry.res_descriptor, m, n, (transpose_result) ? m : n, 16, + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &vc::handle, &res_descriptor, m, n, n, 16, output_type, - (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + CUSPARSE_ORDER_ROW)); TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &entry.C_descriptor, m, n, (transpose_result) ? m : n, 16, C_type, - (transpose_result) ? 
CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + &vc::handle, &C_descriptor, m, n, n, 16, C_type, + CUSPARSE_ORDER_ROW)); + cusparseLtMatmulPlan_t plan; cusparseLtMatmulAlgSelection_t alg_sel; TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( &vc::handle, &entry.matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, (dense_B.is_contiguous()) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE, - &entry.sparse_input_descriptor, &entry.dense_input_descriptor, - &entry.C_descriptor, &entry.res_descriptor, compute_type)); + &sparse_input_descriptor, &dense_input_descriptor, + &C_descriptor, &res_descriptor, compute_type)); TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit( &vc::handle, &alg_sel, &entry.matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)); TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanInit( - &vc::handle, &entry.plan, &entry.matmul, &alg_sel)); + &vc::handle, &plan, &entry.matmul, &alg_sel)); size_t workspace_size; TORCH_CUDASPARSE_CHECK( - cusparseLtMatmulGetWorkspace(&vc::handle, &entry.plan, &workspace_size)); + cusparseLtMatmulGetWorkspace(&vc::handle, &plan, &workspace_size)); + AT_CUDA_CHECK(cudaMalloc((void**) &entry.workspace_ptr, workspace_size)); - entry.allocator = c10::cuda::CUDACachingAllocator::get(); - entry.workspace_ptr = entry.allocator->allocate(workspace_size); entry.device = dense_B.device(); entry.out_dtype = out_dtype; entry.m = m; @@ -256,16 +236,19 @@ vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const torch::Tenso entry.k = k; entry.sparse_mat_ptr = compressed_A.data_ptr(); entry.dense_mat_ptr = dense_B.data_ptr(); + entry.plan = plan; + entry.sparse_input_descriptor = sparse_input_descriptor; + entry.dense_input_descriptor = dense_input_descriptor; + entry.C_descriptor = C_descriptor; + entry.res_descriptor = res_descriptor; + return id; } -torch::Tensor cslt_mm_fp8_semi_structured_prepared( - const torch::Tensor& id_tensor) { +torch::Tensor cslt_mm_fp8_semi_structured_prepared(vllm::cusparseLt::cacheID id) { namespace vc = vllm::cusparseLt; TORCH_CHECK(vc::handle_initialized, "Call of matmul with unintialized matmul"); - TORCH_CHECK(id_tensor.numel() == 1, "ID has to be single valued"); - auto id = id_tensor.item(); if (vc::cusparseLt_cache.count(id) == 0) { TORCH_CHECK(false, "cusparse matmul Id is not found"); } @@ -281,17 +264,15 @@ torch::Tensor cslt_mm_fp8_semi_structured_prepared( TORCH_CUDASPARSE_CHECK( cusparseLtMatmul(&vc::handle, &entry.plan, &alpha, entry.sparse_mat_ptr, entry.dense_mat_ptr, &beta, res.data_ptr(), - res.data_ptr(), entry.workspace_ptr.get(), &stream, 1)); + res.data_ptr(), entry.workspace_ptr, &stream, 1)); return res; } -void cslt_fp8_semi_structured_destroy(const torch::Tensor& id_tensor) { +void cslt_fp8_semi_structured_destroy(vllm::cusparseLt::cacheID id) { namespace vc = vllm::cusparseLt; TORCH_CHECK(vc::handle_initialized, "Call of destroy cusparseId with unintialized cusparseLt"); - TORCH_CHECK(id_tensor.numel() == 1, "ID has to be single valued"); - auto id = id_tensor.item(); if (vc::cusparseLt_cache.count(id) == 0) { TORCH_CHECK(false, "cusparse matmul Id is not found"); } @@ -301,9 +282,12 @@ void cslt_fp8_semi_structured_destroy(const torch::Tensor& id_tensor) { cusparseLtMatDescriptorDestroy(&entry.sparse_input_descriptor)); TORCH_CUDASPARSE_CHECK( cusparseLtMatDescriptorDestroy(&entry.dense_input_descriptor)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&entry.C_descriptor)); TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&entry.res_descriptor)); // Destroy plan 
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&entry.plan)); + AT_CUDA_CHECK(cudaFree(entry.workspace_ptr)); + vc::cusparseLt_cache.erase(id); } torch::Tensor cslt_mm_fp8_semi_structured( @@ -377,19 +361,18 @@ torch::Tensor cslt_mm_fp8_semi_structured( // create result tensor auto res_tensor_options = c10::TensorOptions().dtype(out_dtype).device(dense_B.device()); - at::Tensor res = (transpose_result) ? at::empty({n, m}, res_tensor_options) - : at::empty({m, n}, res_tensor_options); + at::Tensor res = at::empty({m, n}, res_tensor_options); cusparseLtMatDescriptor_t res_descriptor; TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &res_descriptor, m, n, (transpose_result) ? m : n, 16, + &vc::handle, &res_descriptor, m, n, n, 16, output_type, - (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + CUSPARSE_ORDER_ROW)); cusparseLtMatDescriptor_t C_descriptor; TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &C_descriptor, m, n, (transpose_result) ? m : n, 16, C_type, - (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW)); + &vc::handle, &C_descriptor, m, n, n, 16, C_type, + CUSPARSE_ORDER_ROW)); TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( &vc::handle, &matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index f5041430cff85..1a272495c46b5 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -329,24 +329,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cslt_mm_fp8_semi_structured(Tensor! compressed_A, Tensor! denseB," - "Tensor!? alpha, Tensor!? bias, bool transpose_result) -> Tensor"); - + "Tensor!? alpha, Tensor!? bias) -> Tensor"); ops.impl("cslt_mm_fp8_semi_structured", torch::kCUDA, &cslt_mm_fp8_semi_structured); ops.def( "cslt_prepare_mm_fp8_semi_structured(Tensor! compressed_A, Tensor! " - "denseB, Tensor!? bias, bool transpose_result) -> int"); + "denseB, Tensor!? bias) -> int"); ops.impl("cslt_prepare_mm_fp8_semi_structured", torch::kCUDA, &cslt_prepare_mm_fp8_semi_structured); - ops.def("cslt_mm_fp8_semi_structured_prepared(Tensor cacheId) -> Tensor"); - ops.impl("cslt_mm_fp8_semi_structured_prepared", torch::kCUDA, - &cslt_mm_fp8_semi_structured_prepared); + ops.def("cslt_mm_fp8_semi_structured_prepared(int cacheId) -> Tensor"); + ops.impl("cslt_mm_fp8_semi_structured_prepared", &cslt_mm_fp8_semi_structured_prepared); - ops.def("cslt_fp8_semi_structured_destroy(Tensor cacheId) -> ()"); - ops.impl("cslt_fp8_semi_structured_destroy", torch::kCUDA, - &cslt_fp8_semi_structured_destroy); + ops.def("cslt_fp8_semi_structured_destroy(int cacheId) -> ()"); + ops.impl("cslt_fp8_semi_structured_destroy", &cslt_fp8_semi_structured_destroy); #endif // Quantized GEMM for GPTQ. 
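A minimal sketch of the cached-plan workflow from Python, assuming the wrappers in vllm/_custom_ops.py as they stand at this point in the series (the following commit reworks this interface again); it mirrors the test added below:

import torch
from vllm._custom_ops import (semi_structured_fp8_prepare_mm,
                              semi_structured_fp8_mm_prepared,
                              semi_structured_fp8_destroy)
# helpers from the 2:4 sparsity utils used by the tests in this series
from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import (
    compress_to_torch_sparse_semi_structured_mat,
    generate_pruned_semi_structured_mat)

M, N, K = (32, 64, 32)
dtype = torch.float8_e4m3fn
A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=dtype)
A = compress_to_torch_sparse_semi_structured_mat(A_pruned)
B = torch.full((K, N), .25, device='cuda', dtype=dtype).t()

# Build the cusparseLt descriptors, plan and workspace once; returns a cache id.
handle = semi_structured_fp8_prepare_mm(A.packed, B)
# Reuse the cached plan for the actual fp8 2:4 matmul.
C_sparse = semi_structured_fp8_mm_prepared(int(handle))
# Release the cached descriptors, plan and workspace.
semi_structured_fp8_destroy(int(handle))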
diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index c098be7820d7c..143253beffe91 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -9,6 +9,9 @@ is_semi_structured_supported, semi_structured_dense_sparse_T_gemm, semi_structured_sparse_dense_gemm, semi_structured_sparse_dense_gemm_scaled) +from vllm._custom_ops import (semi_structured_fp8_prepare_mm, + semi_structured_fp8_mm_prepared) + DTYPES = [torch.float16, torch.bfloat16, torch.int8] SIZES = [(128, 128), (1024, 8192)] @@ -138,6 +141,22 @@ def test_torch_semi_structured_sparse_dense_T_fp8_scaled_matmul(): torch.float32) torch.testing.assert_close(C, C_sparse, rtol=7e-2, atol=7e-2) +@pytest.mark.skipif( + not is_semi_structured_supported() + or not is_quant_method_supported("modelopt"), + reason="Semi structured fp8 matmul is not supported on this GPU type.") +def test_torch_semi_structured_sparse_dense_T_fp8_matmul_prepared(): + M, N, K = (32, 64, 32) + dtype = torch.float8_e4m3fn + A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=dtype) + A = compress_to_torch_sparse_semi_structured_mat(A_pruned) + B = torch.full((K, N), .25, device='cuda', dtype=dtype).t() + handle = semi_structured_fp8_prepare_mm(A.packed, B) + + C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) + C_sparse = semi_structured_fp8_mm_prepared(int(handle)).to(torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) + @pytest.mark.skipif( not is_semi_structured_supported(), diff --git a/tests/test_cusparseLt.cpp b/tests/test_cusparseLt.cpp deleted file mode 100644 index 9c8d3cb813ef1..0000000000000 --- a/tests/test_cusparseLt.cpp +++ /dev/null @@ -1,12 +0,0 @@ - #include - -cusparseLtHandle_t handle; - - -struct Entry { - cusparseLtMatDescriptor_t sparse_input_descriptor; -}; - -int main() { - -} \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 387403ff4d889..e6c4e2b2d7ac4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -715,20 +715,18 @@ def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: def semi_structured_fp8_mm(A_compressed: torch.Tensor, B_dense: torch.Tensor, - bias: Optional[torch.Tensor] = None, - transpose_result: bool = False) -> torch.Tensor: + bias: Optional[torch.Tensor] = None) -> torch.Tensor: assert A_compressed.dtype == torch.float8_e4m3fn return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, - bias, transpose_result) + bias) def semi_structured_fp8_prepare_mm(A_compressed: torch.Tensor, B_dense: torch.Tensor, - bias: Optional[torch.Tensor] = None, - transpose_result: bool = False) -> int: + bias: Optional[torch.Tensor] = None) -> int: assert A_compressed.dtype == torch.float8_e4m3fn return torch.ops._C.cslt_prepare_mm_fp8_semi_structured( - A_compressed, B_dense, bias, transpose_result) + A_compressed, B_dense, bias) def semi_structured_fp8_mm_prepared(cacheId: int) -> torch.Tensor: From 9ad83cb018726dac96ce27a833fcba6c032a673f Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 5 Nov 2024 15:29:11 +0000 Subject: [PATCH 34/39] Make cached version default function --- .../cusparseLt_benchmarks/benchmark_24.py | 71 ++- csrc/ops.h | 15 +- .../fp8_semi_structured/cusparseLt.cpp | 466 ++++++++++-------- csrc/torch_bindings.cpp | 15 +- tests/kernels/test_semi_structured.py | 65 +-- vllm/_custom_ops.py | 31 +- .../sparsity/utils/cusparse_2_4_utils.py | 30 +- 7 files changed, 355 insertions(+), 338 deletions(-) diff --git 
a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index 17d42d386dd6c..a3b013034e725 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -1,8 +1,8 @@ import argparse import copy import itertools -import pickle as pkl -import time +# import pickle as pkl +# import time from typing import Callable, Iterable, List, Tuple import torch @@ -12,8 +12,7 @@ from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( compress_to_torch_sparse_semi_structured_mat, dense_matmul, get_random_mat, - is_semi_structured_supported, semi_structured_sparse_dense_gemm) -from vllm._custom_ops import (semi_structured_fp8_prepare_mm, semi_structured_fp8_mm_prepared, semi_structured_fp8_destroy) + is_semi_structured_supported, semi_structured_sparse_dense_gemm2) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -78,18 +77,36 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, compress_to_torch_sparse_semi_structured_mat( a.to(dtype=torch.bfloat16)), b.to(torch.bfloat16))) - a, b = make_rand_tensors(torch.int8, m, n, k) + # a_compressed = compress_to_torch_sparse_semi_structured_mat( + # a.to(dtype=torch.bfloat16)) + # b = b.to(torch.bfloat16) + a, b = make_rand_tensors(torch.float16, m, n, k) + a_compressed = compress_to_torch_sparse_semi_structured_mat( + a.to(dtype=torch.bfloat16)) + # warmup + semi_structured_sparse_dense_gemm2(a_compressed, b) + timers.append( + bench_fn(label, sub_label, "cusparseLt_bf16_bf16_2_4_v2", + semi_structured_sparse_dense_gemm2, a_compressed, b)) + + # a, b = make_rand_tensors(torch.int8, m, n, k) # cutlass i8 # timers.append( # bench_fn(label, sub_label, "cutlass_i8_i8_matmul-w-scales", # dense_matmul, a, b, torch.int8)) - + # a_compressed = compress_to_torch_sparse_semi_structured_mat(a) # cusparseLt i8 # timers.append( # bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4", # semi_structured_sparse_dense_gemm, # compress_to_torch_sparse_semi_structured_mat(a), b)) + # warmup + # semi_structured_sparse_dense_gemm2(a_compressed, b) + # timers.append( + # bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4_v2", + # semi_structured_sparse_dense_gemm2, a_compressed, b)) + if use_fp8: a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) # cutlass fp8 @@ -97,21 +114,27 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul-w-scales", dense_matmul, a, b, torch.float8_e4m3fn)) - # cusparseLt fp8 - timers.append( - bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", - semi_structured_sparse_dense_gemm, - compress_to_torch_sparse_semi_structured_mat(a), b)) - a_compressed = compress_to_torch_sparse_semi_structured_mat(a) - handle = semi_structured_fp8_prepare_mm(a_compressed.packed, b) - # id = torch.tensor([handle], dtype=torch.int64, device='cuda') - id = int(handle) + # cusparseLt fp8 + # timers.append( + # bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", + # semi_structured_sparse_dense_gemm, + # a_compressed, b)) + + # warmup + semi_structured_sparse_dense_gemm2(a_compressed, b) timers.append( - bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_prepared", - semi_structured_fp8_mm_prepared, - id)) - semi_structured_fp8_destroy(id) + bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_v2", + semi_structured_sparse_dense_gemm2, a_compressed, b)) + + # handle = semi_structured_fp8_prepare_mm(a_compressed.packed, b) + # id = int(handle) + # 
scale = torch.tensor(1.0, device='cuda', dtype=torch.float32) + # # scale = None + # timers.append( + # bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_prepared", + # semi_structured_fp8_mm_prepared, id, scale=scale)) + # semi_structured_fp8_destroy(id) return timers @@ -124,7 +147,9 @@ def print_timers(timers: Iterable[TMeasurement]): def run(MKNs: Iterable[Tuple[int, int, int]], use_fp8: bool) -> Iterable[TMeasurement]: results = [] + # MKNs = [(1024, 8192, 14336)] # MKNs = [(2048, 8192, 14336)] + # MKNs = [(2048, 8192, 14336), (2048, 8192, 14336)] # MKNs = [(32, 11008, 4096)] # MKNs = [(2048, 11008, 14336)] @@ -186,11 +211,11 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: print(f"== Results cuSparseLt {model}-TP{tp_size} ====") print_timers(data) - timestamp = int(time.time()) + # timestamp = int(time.time()) - all_data = [] - for d in model_bench_data: - all_data.extend(d) + # all_data = [] + # for d in model_bench_data: + # all_data.extend(d) # pickle all data # with open(f"model_bench-{timestamp}.pkl", "wb") as f: # pkl.dump(all_data, f) diff --git a/csrc/ops.h b/csrc/ops.h index 8e88bc24f74c8..cc18915d37762 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -228,17 +228,12 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input); torch::Tensor cslt_mm_fp8_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, - const c10::optional& alpha_opt, + const c10::optional& scale_opt, const c10::optional& bias_opt); -int64_t cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, - const torch::Tensor& dense_B, - const c10::optional& bias_opt); - -// torch::Tensor cslt_mm_fp8_semi_structured_prepared(const torch::Tensor& id); -torch::Tensor cslt_mm_fp8_semi_structured_prepared(int64_t id); - -// void cslt_fp8_semi_structured_destroy(const torch::Tensor& id_tensor); -void cslt_fp8_semi_structured_destroy(int64_t id); +torch::Tensor cslt_mm_fp8_semi_structured2( + const torch::Tensor& compressed_A, const torch::Tensor& dense_B, + const c10::optional& scale_opt, + const c10::optional& bias_opt); #endif diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index 36106c25ee179..eafc4ac72736c 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -16,33 +16,20 @@ \ torch::Tensor cslt_mm_fp8_semi_structured( \ const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ + const c10::optional& scale_opt, \ const c10::optional& bias_opt) { \ TORCH_CHECK(false, \ "Unsupported dtype for compressed matrix multiplication in " \ "current version of cuSPARSELt."); \ } \ - \ - int64_t cslt_prepare_mm_fp8_semi_structured( \ - const torch::Tensor& compressed_A, const torch::Tensor& dense_B) { \ - TORCH_CHECK(false, \ - "cusparseLt is not found or " \ - "unsupported dtype for compressed matrix in current " \ - "version of cuSPARSELt."); \ - } \ - \ - torch::Tensor cslt_mm_fp8_semi_structured_prepared(int64_t id) { \ - TORCH_CHECK(false, \ - "cusparseLt is not found or " \ - "unsupported dtype for compressed matrix in current " \ - "version of cuSPARSELt."); \ - } \ - \ - void cslt_fp8_semi_structured_destroy(int64_t id) { \ + torch::Tensor cslt_mm_fp8_semi_structured2( \ + const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ + const c10::optional& scale_opt, \ + const c10::optional& bias_opt) { \ TORCH_CHECK(false, \ - "cusparseLt is not found or " \ - "unsupported 
dtype for compressed matrix in current " \ - "version of cuSPARSELt."); \ - } \ + "Unsupported dtype for compressed matrix multiplication in " \ + "current version of cuSPARSELt."); \ + } #if defined(VLLM_CUSPARSELT_ENABLED) @@ -58,67 +45,228 @@ " when calling `" #EXPR "`"); \ } while (0) - - namespace vllm { namespace cusparseLt { - struct cusparseLtEntry { - cusparseLtMatDescriptor_t sparse_input_descriptor; - cusparseLtMatDescriptor_t dense_input_descriptor; - cusparseLtMatDescriptor_t res_descriptor; - cusparseLtMatDescriptor_t C_descriptor; - - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulPlan_t plan; + cusparseLtMatDescriptor_t* sparse_input_descriptor_p; + cusparseLtMatDescriptor_t* dense_input_descriptor_p; + cusparseLtMatDescriptor_t* res_descriptor_p; + cusparseLtMatDescriptor_t* C_descriptor_p; + cusparseLtMatmulDescriptor_t* matmul_p; + cusparseLtMatmulPlan_t* plan_p; + cusparseLtMatmulAlgSelection_t* alg_sel_p; - void* sparse_mat_ptr; - void* dense_mat_ptr; - - torch::Device device = torch::kCUDA; - torch::Dtype out_dtype; void* workspace_ptr; - int m; - int n; - int k; + ~cusparseLtEntry() { + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(sparse_input_descriptor_p)); + TORCH_CUDASPARSE_CHECK( + cusparseLtMatDescriptorDestroy(dense_input_descriptor_p)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(C_descriptor_p)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(res_descriptor_p)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(plan_p)); + + // Destructor is called after the cuda cleanup so double free is done here. + // AT_CUDA_CHECK(cudaFree(workspace_ptr)); + delete sparse_input_descriptor_p; + delete dense_input_descriptor_p; + delete res_descriptor_p; + delete C_descriptor_p; + delete plan_p; + delete alg_sel_p; + delete matmul_p; + } }; cusparseLtHandle_t handle; bool handle_initialized = false; -using cacheID = int64_t; - +using cacheID = std::tuple; std::map cusparseLt_cache; + +void prepare_mm_semi_structured(const cacheID& tuple_id, + at::ScalarType out_dtype, + bool is_B_contiguous) { + auto m = std::get<0>(tuple_id); + auto k = std::get<1>(tuple_id); + auto n = std::get<2>(tuple_id); + at::ScalarType input_dtype = std::get<3>(tuple_id); + auto& entry = cusparseLt_cache[tuple_id]; + + cudaDataType input_type; + cudaDataType output_type; + cudaDataType C_type; + cusparseComputeType compute_type; + + switch (input_dtype) { + case at::ScalarType::Char: + input_type = CUDA_R_8I; + output_type = CUDA_R_8I; + C_type = CUDA_R_8I; + compute_type = CUSPARSE_COMPUTE_32I; + break; + case at::ScalarType::Half: + input_type = CUDA_R_16F; + output_type = CUDA_R_16F; + C_type = CUDA_R_16F; + compute_type = CUSPARSE_COMPUTE_32F; + break; + case at::ScalarType::BFloat16: + input_type = CUDA_R_16BF; + output_type = CUDA_R_16BF; + C_type = CUDA_R_16BF; + compute_type = CUSPARSE_COMPUTE_32F; + break; + case at::ScalarType::Float: + input_type = CUDA_R_32F; + output_type = CUDA_R_32F; + C_type = CUDA_R_32F; + compute_type = CUSPARSE_COMPUTE_32F; + break; + case at::ScalarType::Float8_e4m3fn: + input_type = CUDA_R_8F_E4M3; + output_type = CUDA_R_8F_E4M3; + C_type = CUDA_R_16F; + compute_type = CUSPARSE_COMPUTE_32F; + break; + default: + TORCH_CHECK( + false, + "Unsupported dtype for cuSPARSELt compressed matrix multiplication."); + break; + } + + // cudaDataType input_type = CUDA_R_8F_E4M3; + // cudaDataType output_type; + // cudaDataType C_type; + // cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; + // switch (out_dtype) { + // case 
at::ScalarType::Float8_e4m3fn: + // output_type = CUDA_R_8F_E4M3; + // C_type = CUDA_R_16F; + // break; + // case at::ScalarType::Half: + // output_type = CUDA_R_16F; + // C_type = CUDA_R_16F; + // break; + // case at::ScalarType::BFloat16: + // output_type = CUDA_R_16BF; + // C_type = CUDA_R_16BF; + // break; + // case at::ScalarType::Float: + // output_type = CUDA_R_32F; + // C_type = CUDA_R_32F; + // break; + // default: + // TORCH_CHECK(false, + // "Unsupported out_dtype passed, must be one of {fp16, bf16, + // " "float32} for fp8 inputs"); + // break; + // } + entry.sparse_input_descriptor_p = new cusparseLtMatDescriptor_t(); + entry.dense_input_descriptor_p = new cusparseLtMatDescriptor_t(); + entry.res_descriptor_p = new cusparseLtMatDescriptor_t(); + entry.C_descriptor_p = new cusparseLtMatDescriptor_t(); + + TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( + &handle, entry.sparse_input_descriptor_p, m, k, k, 16, input_type, + CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); + + // initialize dense descriptor + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, entry.dense_input_descriptor_p, (is_B_contiguous) ? k : n, + (is_B_contiguous) ? n : k, (is_B_contiguous) ? n : k, 16, input_type, + CUSPARSE_ORDER_ROW)); + + // initialize result descriptor + TORCH_CUDASPARSE_CHECK( + cusparseLtDenseDescriptorInit(&handle, entry.res_descriptor_p, m, n, n, + 16, output_type, CUSPARSE_ORDER_ROW)); + TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( + &handle, entry.C_descriptor_p, m, n, n, 16, C_type, CUSPARSE_ORDER_ROW)); + + entry.matmul_p = new cusparseLtMatmulDescriptor_t(); + entry.plan_p = new cusparseLtMatmulPlan_t(); + entry.alg_sel_p = new cusparseLtMatmulAlgSelection_t(); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( + &handle, entry.matmul_p, CUSPARSE_OPERATION_NON_TRANSPOSE, + (is_B_contiguous) ? 
CUSPARSE_OPERATION_NON_TRANSPOSE + : CUSPARSE_OPERATION_TRANSPOSE, + entry.sparse_input_descriptor_p, entry.dense_input_descriptor_p, + entry.C_descriptor_p, entry.res_descriptor_p, compute_type)); + + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit( + &handle, entry.alg_sel_p, entry.matmul_p, CUSPARSELT_MATMUL_ALG_DEFAULT)); + int num_search_iters = 5; + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSetAttribute( + &handle, entry.alg_sel_p, CUSPARSELT_MATMUL_SEARCH_ITERATIONS, + &num_search_iters, sizeof(num_search_iters))); + + // TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanInit( + // &handle, &plan, &entry.matmul, &alg_sel)); + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanInit( + &handle, entry.plan_p, entry.matmul_p, entry.alg_sel_p)); + + size_t workspace_size; + // TORCH_CUDASPARSE_CHECK( + // cusparseLtMatmulGetWorkspace(&handle, &global_plan, &workspace_size)); + TORCH_CUDASPARSE_CHECK( + cusparseLtMatmulGetWorkspace(&handle, entry.plan_p, &workspace_size)); + AT_CUDA_CHECK(cudaMalloc((void**)&entry.workspace_ptr, workspace_size)); +} + } // namespace cusparseLt } // namespace vllm -// vllm::cusparseLt::cusparseLtEntry entry; - torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { - TORCH_CHECK(input.scalar_type() == at::ScalarType::Float8_e4m3fn, - "Only float8 e4m3 is supported in vllm:cslt_compress"); namespace vc = vllm::cusparseLt; if (!vc::handle_initialized) { TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vc::handle)); vc::handle_initialized = true; } - // create sparse descriptor, dtype + + cudaDataType type; auto compression_factor = 9; cusparseLtMatDescriptor_t input_descriptor; - cudaDataType type = CUDA_R_8F_E4M3; + + switch (input.scalar_type()) { + case at::ScalarType::Char: + type = CUDA_R_8I; + compression_factor = 10; + break; + case at::ScalarType::Half: + type = CUDA_R_16F; + break; + case at::ScalarType::BFloat16: + type = CUDA_R_16BF; + break; + case at::ScalarType::Float: + type = CUDA_R_32F; + break; + case at::ScalarType::Float8_e4m3fn: + type = CUDA_R_8F_E4M3; + break; + default: + TORCH_CHECK(false, "Unsupported dtype for cuSPARSELt compressed matrix"); + break; + } + auto compressed_tensor = input.new_empty(input.numel() * compression_factor / 16); TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &vc::handle, &input_descriptor, input.size(0), input.size(1), input.size(1), - 16, type, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); + &vc::handle, &input_descriptor, input.size(0), input.size(1), + input.size(1), 16, type, CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT)); size_t compressed_size, compressed_buffer_size; TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompressedSize2( - &vc::handle, &input_descriptor, &compressed_size, &compressed_buffer_size)); + &vc::handle, &input_descriptor, &compressed_size, + &compressed_buffer_size)); auto& allocator = *c10::cuda::CUDACachingAllocator::get(); auto compressedBufferPtr = allocator.allocate(compressed_buffer_size); @@ -131,169 +279,69 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { return compressed_tensor; } -vllm::cusparseLt::cacheID cslt_prepare_mm_fp8_semi_structured(const torch::Tensor& compressed_A, - const torch::Tensor& dense_B, - const c10::optional& bias_opt) { - TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, - "Only float8 e4m3 is supported in vllm:cslt_compress"); +torch::Tensor cslt_mm_fp8_semi_structured( + const torch::Tensor& compressed_A, const torch::Tensor& dense_B, + const c10::optional& 
alpha_opt, + const c10::optional& bias_opt) { namespace vc = vllm::cusparseLt; if (!vc::handle_initialized) { TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vc::handle)); vc::handle_initialized = true; } - vc::cacheID id; - if (vc::cusparseLt_cache.empty()) { - id = 0; - } else { - id = vc::cusparseLt_cache.rbegin()->first + 1; - } - - vc::cusparseLtEntry& entry = vc::cusparseLt_cache[id]; - float alpha = 1.0; - float beta = 0.0; - cudaDataType input_type = CUDA_R_8F_E4M3; - cudaDataType output_type; - cudaDataType C_type; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; - auto compression_factor = 9; + auto input_dtype = compressed_A.scalar_type(); auto out_dtype = dense_B.scalar_type(); + auto compression_factor = (input_dtype == at::ScalarType::Char) ? 10 : 9; + int64_t k = dense_B.size(0); int64_t n = dense_B.size(1); int64_t m = (compressed_A.numel() * 16 / compression_factor) / k; - cusparseLtMatDescriptor_t sparse_input_descriptor; - cusparseLtMatDescriptor_t dense_input_descriptor; - cusparseLtMatDescriptor_t res_descriptor; - cusparseLtMatDescriptor_t C_descriptor; - - switch (out_dtype) { - case at::ScalarType::Float8_e4m3fn: - output_type = CUDA_R_8F_E4M3; - C_type = CUDA_R_16F; - break; - case at::ScalarType::Half: - output_type = CUDA_R_16F; - C_type = CUDA_R_16F; - break; - case at::ScalarType::BFloat16: - output_type = CUDA_R_16BF; - C_type = CUDA_R_16BF; - break; - case at::ScalarType::Float: - output_type = CUDA_R_32F; - C_type = CUDA_R_32F; - break; - default: - TORCH_CHECK(false, - "Unsupported out_dtype passed, must be one of {fp16, bf16, " - "float32} for fp8 inputs"); - break; + vc::cacheID tuple_id = std::make_tuple(m, k, n, input_dtype); + bool found = vc::cusparseLt_cache.count(tuple_id); + if (not found) { + vc::prepare_mm_semi_structured(tuple_id, out_dtype, + dense_B.is_contiguous()); } - // initialize sparse descriptor - TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( - &vc::handle, &sparse_input_descriptor, m, k, k, 16, input_type, - CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)); - - // initialize dense descriptor - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &dense_input_descriptor, - (dense_B.is_contiguous()) ? k : n, (dense_B.is_contiguous()) ? n : k, - (dense_B.is_contiguous()) ? n : k, 16, input_type, CUSPARSE_ORDER_ROW)); - - // initialize result descriptor - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &res_descriptor, m, n, n, 16, - output_type, - CUSPARSE_ORDER_ROW)); - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &C_descriptor, m, n, n, 16, C_type, - CUSPARSE_ORDER_ROW)); + auto& entry = vc::cusparseLt_cache[tuple_id]; - cusparseLtMatmulPlan_t plan; - cusparseLtMatmulAlgSelection_t alg_sel; - - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( - &vc::handle, &entry.matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, - (dense_B.is_contiguous()) ? 
CUSPARSE_OPERATION_NON_TRANSPOSE - : CUSPARSE_OPERATION_TRANSPOSE, - &sparse_input_descriptor, &dense_input_descriptor, - &C_descriptor, &res_descriptor, compute_type)); - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit( - &vc::handle, &alg_sel, &entry.matmul, - CUSPARSELT_MATMUL_ALG_DEFAULT)); - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanInit( - &vc::handle, &plan, &entry.matmul, &alg_sel)); - size_t workspace_size; - TORCH_CUDASPARSE_CHECK( - cusparseLtMatmulGetWorkspace(&vc::handle, &plan, &workspace_size)); - AT_CUDA_CHECK(cudaMalloc((void**) &entry.workspace_ptr, workspace_size)); - - entry.device = dense_B.device(); - entry.out_dtype = out_dtype; - entry.m = m; - entry.n = n; - entry.k = k; - entry.sparse_mat_ptr = compressed_A.data_ptr(); - entry.dense_mat_ptr = dense_B.data_ptr(); - entry.plan = plan; - entry.sparse_input_descriptor = sparse_input_descriptor; - entry.dense_input_descriptor = dense_input_descriptor; - entry.C_descriptor = C_descriptor; - entry.res_descriptor = res_descriptor; - - return id; -} - -torch::Tensor cslt_mm_fp8_semi_structured_prepared(vllm::cusparseLt::cacheID id) { - namespace vc = vllm::cusparseLt; - TORCH_CHECK(vc::handle_initialized, - "Call of matmul with unintialized matmul"); - if (vc::cusparseLt_cache.count(id) == 0) { - TORCH_CHECK(false, "cusparse matmul Id is not found"); + // set bias pointer for matmul, need to assign to get location + if (bias_opt.has_value()) { + auto& bias = bias_opt.value(); + void* dBias = bias.data_ptr(); + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( + &vc::handle, entry.matmul_p, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, + sizeof(dBias))); } - const auto& entry = vc::cusparseLt_cache[id]; - auto res_tensor_options = - c10::TensorOptions().dtype(entry.out_dtype).device(entry.device); - at::Tensor res = at::empty({entry.m, entry.n}, res_tensor_options); - float alpha = 1.0; + // float alpha = 1.0; + float alpha = alpha_opt.has_value() ? 
static_cast(*alpha_opt) : 1.0; float beta = 0.0; - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - TORCH_CUDASPARSE_CHECK( - cusparseLtMatmul(&vc::handle, &entry.plan, &alpha, entry.sparse_mat_ptr, - entry.dense_mat_ptr, &beta, res.data_ptr(), - res.data_ptr(), entry.workspace_ptr, &stream, 1)); + auto alpha_ptr = α - return res; -} + auto res_tensor_options = + c10::TensorOptions().dtype(out_dtype).device(dense_B.device()); + at::Tensor res = at::empty({m, n}, res_tensor_options); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); -void cslt_fp8_semi_structured_destroy(vllm::cusparseLt::cacheID id) { - namespace vc = vllm::cusparseLt; - TORCH_CHECK(vc::handle_initialized, - "Call of destroy cusparseId with unintialized cusparseLt"); - if (vc::cusparseLt_cache.count(id) == 0) { - TORCH_CHECK(false, "cusparse matmul Id is not found"); + if (found) { + TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( + &vc::handle, entry.plan_p, alpha_ptr, compressed_A.data_ptr(), + dense_B.data_ptr(), &beta, res.data_ptr(), res.data_ptr(), + entry.workspace_ptr, &stream, 1)); + } else { + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulSearch( + &vc::handle, entry.plan_p, alpha_ptr, compressed_A.data_ptr(), + dense_B.data_ptr(), &beta, res.data_ptr(), res.data_ptr(), + entry.workspace_ptr, &stream, 1)); } - auto& entry = vc::cusparseLt_cache[id]; - - TORCH_CUDASPARSE_CHECK( - cusparseLtMatDescriptorDestroy(&entry.sparse_input_descriptor)); - TORCH_CUDASPARSE_CHECK( - cusparseLtMatDescriptorDestroy(&entry.dense_input_descriptor)); - TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&entry.C_descriptor)); - TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&entry.res_descriptor)); - // Destroy plan - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&entry.plan)); - AT_CUDA_CHECK(cudaFree(entry.workspace_ptr)); - vc::cusparseLt_cache.erase(id); + return res; } -torch::Tensor cslt_mm_fp8_semi_structured( +torch::Tensor cslt_mm_fp8_semi_structured2( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, - const c10::optional& alpha_opt, - const c10::optional& bias_opt, bool transpose_result) { + const c10::optional& alpha_opt, + const c10::optional& bias_opt) { TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, "Only float8 e4m3 is supported in vllm:cslt_compress"); namespace vc = vllm::cusparseLt; @@ -307,9 +355,6 @@ torch::Tensor cslt_mm_fp8_semi_structured( cusparseLtMatmulPlan_t plan; cusparseLtMatmulAlgSelection_t alg_sel; - int tensor_alpha_mode = 0; - float alpha = 1.0; - float beta = 0.0; cudaDataType input_type = CUDA_R_8F_E4M3; cudaDataType output_type; cudaDataType C_type; @@ -364,15 +409,13 @@ torch::Tensor cslt_mm_fp8_semi_structured( at::Tensor res = at::empty({m, n}, res_tensor_options); cusparseLtMatDescriptor_t res_descriptor; - TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &res_descriptor, m, n, n, 16, - output_type, - CUSPARSE_ORDER_ROW)); + TORCH_CUDASPARSE_CHECK( + cusparseLtDenseDescriptorInit(&vc::handle, &res_descriptor, m, n, n, 16, + output_type, CUSPARSE_ORDER_ROW)); cusparseLtMatDescriptor_t C_descriptor; TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit( - &vc::handle, &C_descriptor, m, n, n, 16, C_type, - CUSPARSE_ORDER_ROW)); + &vc::handle, &C_descriptor, m, n, n, 16, C_type, CUSPARSE_ORDER_ROW)); TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit( &vc::handle, &matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -390,20 +433,10 @@ torch::Tensor cslt_mm_fp8_semi_structured( sizeof(dBias))); } - const auto 
alpha_tensor = - alpha_opt.has_value() ? *alpha_opt : torch::Tensor{}; + float beta = 0.0; + const float alpha = + alpha_opt.has_value() ? static_cast(*alpha_opt) : 1.0; auto alpha_ptr = α - if (alpha_opt.has_value()) { - if (alpha_tensor.numel() == 1) { - alpha = alpha_tensor.item(); - } else { - tensor_alpha_mode = 1; - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( - &handle, &matmul, CUSPARSELT_MATMUL_ALPHA_VECTOR_SCALING, - &tensor_alpha_mode, sizeof(tensor_alpha_mode))); - alpha_ptr = static_cast(alpha_tensor.data_ptr()); - } - } TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit( &vc::handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)); @@ -418,9 +451,10 @@ torch::Tensor cslt_mm_fp8_semi_structured( auto workspace_ptr = allocator.allocate(workspace_size); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( - &vc::handle, &plan, &alpha, compressed_A.data_ptr(), dense_B.data_ptr(), - &beta, res.data_ptr(), res.data_ptr(), workspace_ptr.get(), &stream, 1)); + TORCH_CUDASPARSE_CHECK( + cusparseLtMatmul(&vc::handle, &plan, alpha_ptr, compressed_A.data_ptr(), + dense_B.data_ptr(), &beta, res.data_ptr(), + res.data_ptr(), workspace_ptr.get(), &stream, 1)); // Destroy descriptors TORCH_CUDASPARSE_CHECK( diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 1a272495c46b5..7e356954f8d04 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -329,21 +329,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cslt_mm_fp8_semi_structured(Tensor! compressed_A, Tensor! denseB," - "Tensor!? alpha, Tensor!? bias) -> Tensor"); + "float!? scale, Tensor!? bias) -> Tensor"); ops.impl("cslt_mm_fp8_semi_structured", torch::kCUDA, &cslt_mm_fp8_semi_structured); ops.def( - "cslt_prepare_mm_fp8_semi_structured(Tensor! compressed_A, Tensor! " - "denseB, Tensor!? bias) -> int"); - ops.impl("cslt_prepare_mm_fp8_semi_structured", torch::kCUDA, - &cslt_prepare_mm_fp8_semi_structured); + "cslt_mm_fp8_semi_structured2(Tensor! compressed_A, Tensor! denseB," + "float!? scale, Tensor!? bias) -> Tensor"); + ops.impl("cslt_mm_fp8_semi_structured2", torch::kCUDA, + &cslt_mm_fp8_semi_structured2); - ops.def("cslt_mm_fp8_semi_structured_prepared(int cacheId) -> Tensor"); - ops.impl("cslt_mm_fp8_semi_structured_prepared", &cslt_mm_fp8_semi_structured_prepared); - - ops.def("cslt_fp8_semi_structured_destroy(int cacheId) -> ()"); - ops.impl("cslt_fp8_semi_structured_destroy", &cslt_fp8_semi_structured_destroy); #endif // Quantized GEMM for GPTQ. 
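As a rough usage sketch (illustration only, not meant as part of the diff): the binding change above replaces the tensor-valued alpha with a plain `float? scale` and makes the default op name go through the plan cache keyed on (m, k, n, dtype). Assuming the Python wrappers and helpers that appear elsewhere in this series (`semi_structured_fp8_mm` in vllm/_custom_ops.py, plus `generate_pruned_semi_structured_mat` and `compress_to_torch_sparse_semi_structured_mat` in cusparse_2_4_utils.py), the cached path could be exercised like this; the first call for a given shape/dtype key is expected to run the cusparseLt plan search, later calls reuse the stored plan.

import torch
from vllm._custom_ops import semi_structured_fp8_mm
from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import (
    compress_to_torch_sparse_semi_structured_mat,
    generate_pruned_semi_structured_mat)

M, K, N = 32, 128, 64
A_pruned = generate_pruned_semi_structured_mat(M, K, dtype=torch.float8_e4m3fn)  # 2:4-pruned fp8 matrix
A = compress_to_torch_sparse_semi_structured_mat(A_pruned)                       # cusparseLt-packed wrapper
B = torch.full((N, K), .25, device='cuda', dtype=torch.float8_e4m3fn).t()        # dense operand, non-contiguous for fp8

C_first = semi_structured_fp8_mm(A.packed, B)   # cache miss: plan search + matmul
C_again = semi_structured_fp8_mm(A.packed, B)   # cache hit: reuses the prepared plan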
diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index 143253beffe91..c67bee2d74b22 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -8,10 +8,7 @@ generate_pruned_semi_structured_mat, get_random_mat, is_semi_structured_supported, semi_structured_dense_sparse_T_gemm, semi_structured_sparse_dense_gemm, - semi_structured_sparse_dense_gemm_scaled) -from vllm._custom_ops import (semi_structured_fp8_prepare_mm, - semi_structured_fp8_mm_prepared) - + semi_structured_sparse_dense_gemm_scaled, semi_structured_sparse_dense_gemm2) DTYPES = [torch.float16, torch.bfloat16, torch.int8] SIZES = [(128, 128), (1024, 8192)] @@ -68,8 +65,6 @@ def test_semi_structured_fp8_compress(size): @pytest.mark.parametrize("mnk", MNK) @pytest.mark.parametrize("dtype", DTYPES) def test_torch_semi_structured_sparse_dense_matmul(mnk, dtype): - # if dtype is torch.int8: - # pytest.skip("cusparse does not support sparse x non transposed dense") M, N, K = mnk A_pruned = generate_pruned_semi_structured_mat(M, K, dtype) A = compress_to_torch_sparse_semi_structured_mat(A_pruned) @@ -82,6 +77,12 @@ def test_torch_semi_structured_sparse_dense_matmul(mnk, dtype): C = dense_matmul(A_pruned, B, dtype) torch.testing.assert_close(C, C_sparse) + # Verify cache + B = get_random_mat(K, N, dtype) + C = dense_matmul(A_pruned, B, dtype) + C_sparse = semi_structured_sparse_dense_gemm(A, B) + torch.testing.assert_close(C, C_sparse) + @pytest.mark.skipif( not is_semi_structured_supported(), @@ -98,63 +99,27 @@ def test_torch_semi_structured_sparse_dense_T_matmul(mnk, dtype): C = dense_matmul(A_pruned, B.t(), dtype) torch.testing.assert_close(C, C_sparse) + # Verify cache + B = get_random_mat(N, K, dtype) + C = dense_matmul(A_pruned, B.t(), dtype) + C_sparse = semi_structured_sparse_dense_gemm(A, B.t()) + torch.testing.assert_close(C, C_sparse) -# TODO modelopt config has to be replaced with corresponding fp8_24 config -@pytest.mark.skipif( - not is_semi_structured_supported() - or not is_quant_method_supported("modelopt"), - reason="Semi structured fp8 matmul is not supported on this GPU type.") -def test_torch_semi_structured_sparse_dense_T_fp8_matmul(): - M, N, K = (32, 64, 32) - dtype = torch.float8_e4m3fn - A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=dtype) - A = compress_to_torch_sparse_semi_structured_mat(A_pruned) - B = torch.full((K, N), .25, device='cuda', dtype=dtype).t() - - C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) - C_sparse = semi_structured_sparse_dense_gemm(A, B).to(torch.float32) - torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) - - -@pytest.mark.skipif( - not is_semi_structured_supported() - or not is_quant_method_supported("modelopt"), - reason="Semi structured fp8 matmul is not supported on this GPU type.") -def test_torch_semi_structured_sparse_dense_T_fp8_scaled_matmul(): - M, N, K = (32, 64, 32) - A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=torch.float16) - A_pruned_fp8, scale_A = to_float8(A_pruned) - B = torch.rand((K, N), device='cuda').to(torch.float16).t() - B_fp8, scale_B = to_float8(B) - - A_fp8_sparse = compress_to_torch_sparse_semi_structured_mat(A_pruned_fp8) - - C = torch._scaled_mm(A_pruned_fp8, - B_fp8, - scale_a=scale_A, - scale_b=scale_B, - out_dtype=torch.float32) - C_sparse = semi_structured_sparse_dense_gemm_scaled(A_fp8_sparse, - B_fp8, - scale_a=scale_A, - scale_b=scale_B).to( - torch.float32) - torch.testing.assert_close(C, C_sparse, rtol=7e-2, 
atol=7e-2) +# TODO modelopt config has to be replaced with corresponding fp8_24 config @pytest.mark.skipif( not is_semi_structured_supported() or not is_quant_method_supported("modelopt"), reason="Semi structured fp8 matmul is not supported on this GPU type.") -def test_torch_semi_structured_sparse_dense_T_fp8_matmul_prepared(): +def test_torch_semi_structured_sparse_dense_T_fp8_matmul2(): M, N, K = (32, 64, 32) dtype = torch.float8_e4m3fn A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=dtype) A = compress_to_torch_sparse_semi_structured_mat(A_pruned) B = torch.full((K, N), .25, device='cuda', dtype=dtype).t() - handle = semi_structured_fp8_prepare_mm(A.packed, B) C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) - C_sparse = semi_structured_fp8_mm_prepared(int(handle)).to(torch.float32) + C_sparse = semi_structured_sparse_dense_gemm2(A, B).to(torch.float32) torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e6c4e2b2d7ac4..b7260c1a18da0 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -713,28 +713,23 @@ def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: return torch.ops._C.cslt_compress_fp8_semi_structured(input) -def semi_structured_fp8_mm(A_compressed: torch.Tensor, - B_dense: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - assert A_compressed.dtype == torch.float8_e4m3fn +def semi_structured_fp8_mm( + A_compressed: torch.Tensor, + B_dense: torch.Tensor, + scale: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, - bias) + scale, bias) -def semi_structured_fp8_prepare_mm(A_compressed: torch.Tensor, - B_dense: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> int: +def semi_structured_fp8_mm2( + A_compressed: torch.Tensor, + B_dense: torch.Tensor, + scale: Optional[float] = None, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: assert A_compressed.dtype == torch.float8_e4m3fn - return torch.ops._C.cslt_prepare_mm_fp8_semi_structured( - A_compressed, B_dense, bias) - - -def semi_structured_fp8_mm_prepared(cacheId: int) -> torch.Tensor: - return torch.ops._C.cslt_mm_fp8_semi_structured_prepared(cacheId) - - -def semi_structured_fp8_destroy(cacheId: int): - torch.ops._C.cslt_fp8_semi_structured_destroy(cacheId) + return torch.ops._C.cslt_mm_fp8_semi_structured2(A_compressed, B_dense, + scale, bias) # int8 diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 58af70318aac8..23cfd825e3448 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -9,7 +9,7 @@ from vllm import _custom_ops as ops from vllm._custom_ops import (semi_structured_fp8_compress, - semi_structured_fp8_mm) + semi_structured_fp8_mm, semi_structured_fp8_mm2) from vllm.platforms import current_platform SparseSemiStructuredTensor._FORCE_CUTLASS = False @@ -80,16 +80,24 @@ def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, ''' assert a_packed.dtype in [ torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn - ], f"Semi structured sparse-dense matmul does not support {a_packed.dtype}" - if b_dense.is_contiguous() and a_packed.dtype in [torch.int8, torch.float8_e4m3fn]: - raise ValueError("cuSparseLt does not support contiguous dense matrix for int8 and fp8 
types") - if a_packed.dtype == torch.float8_e4m3fn: - return semi_structured_fp8_mm(a_packed.packed, - b_dense, - bias=bias, - transpose_result=False) - else: - return torch.mm(a_packed, b_dense) + ], f"Semi structured sparse-dense matmul does not support {a_sparse.dtype}" + scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) + return semi_structured_fp8_mm(a_sparse.packed, b_dense, scale=scale) + + # if a_sparse.dtype == torch.float8_e4m3fn: + # scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) + # return semi_structured_fp8_mm(a_sparse.packed, b_dense, scale=scale) + # else: + # return torch.mm(a_sparse, b_dense) + + +def semi_structured_sparse_dense_gemm2(a_sparse: torch.Tensor, + b_dense: torch.Tensor): + assert a_sparse.dtype in [ + torch.float8_e4m3fn + ], f"Semi structured sparse-dense matmul does not support {a_sparse.dtype}" + scale = 1.0 + return semi_structured_fp8_mm2(a_sparse.packed, b_dense, scale=scale) def semi_structured_dense_sparse_T_gemm(a_dense: torch.Tensor, From 1f6a05b6954bec4f205f0e8e8327a138aaf105bf Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 6 Nov 2024 15:17:27 +0000 Subject: [PATCH 35/39] Fixes and polishing after rebase --- .../cusparseLt_benchmarks/benchmark_24.py | 121 +++++++------- csrc/ops.h | 4 +- .../fp8_semi_structured/cusparseLt.cpp | 93 ++++++----- csrc/torch_bindings.cpp | 8 +- tests/kernels/test_semi_structured.py | 155 +++++++++++++++++- vllm/_custom_ops.py | 17 +- .../sparsity/utils/cusparse_2_4_utils.py | 143 ++++++++++------ 7 files changed, 358 insertions(+), 183 deletions(-) diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index a3b013034e725..b66ef0fa7b29d 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -1,8 +1,6 @@ import argparse import copy import itertools -# import pickle as pkl -# import time from typing import Callable, Iterable, List, Tuple import torch @@ -12,7 +10,7 @@ from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( compress_to_torch_sparse_semi_structured_mat, dense_matmul, get_random_mat, - is_semi_structured_supported, semi_structured_sparse_dense_gemm2) + is_semi_structured_supported, semi_structured_sparse_dense_gemm) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -69,43 +67,54 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, semi_structured_sparse_dense_gemm, compress_to_torch_sparse_semi_structured_mat(a), b)) + timers.append( + bench_fn(label, + sub_label, + "cusparseLt_fp16_fp16_2_4_noncached", + semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a), + b, + cached=False)) + # cusparseLt bf16 + a, b = make_rand_tensors(torch.bfloat16, m, n, k) + a_compressed = compress_to_torch_sparse_semi_structured_mat(a.to(dtype=torch.bfloat16)) + timers.append( - bench_fn( - label, sub_label, "cusparseLt_bf16_bf16_2_4", - semi_structured_sparse_dense_gemm, - compress_to_torch_sparse_semi_structured_mat( - a.to(dtype=torch.bfloat16)), b.to(torch.bfloat16))) - - # a_compressed = compress_to_torch_sparse_semi_structured_mat( - # a.to(dtype=torch.bfloat16)) - # b = b.to(torch.bfloat16) - a, b = make_rand_tensors(torch.float16, m, n, k) - a_compressed = compress_to_torch_sparse_semi_structured_mat( - a.to(dtype=torch.bfloat16)) - # warmup - semi_structured_sparse_dense_gemm2(a_compressed, b) + bench_fn(label, sub_label, "cusparseLt_bf16_bf16_2_4", + 
semi_structured_sparse_dense_gemm, a_compressed, b)) + timers.append( - bench_fn(label, sub_label, "cusparseLt_bf16_bf16_2_4_v2", - semi_structured_sparse_dense_gemm2, a_compressed, b)) + bench_fn(label, + sub_label, + "cusparseLt_bf16_bf16_2_4_noncached", + semi_structured_sparse_dense_gemm, + a_compressed, + b, + cached=False)) - # a, b = make_rand_tensors(torch.int8, m, n, k) + a, b = make_rand_tensors(torch.int8, m, n, k) # cutlass i8 - # timers.append( - # bench_fn(label, sub_label, "cutlass_i8_i8_matmul-w-scales", - # dense_matmul, a, b, torch.int8)) - # a_compressed = compress_to_torch_sparse_semi_structured_mat(a) - # cusparseLt i8 - # timers.append( - # bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4", - # semi_structured_sparse_dense_gemm, - # compress_to_torch_sparse_semi_structured_mat(a), b)) + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_matmul", dense_matmul, a, b, + torch.int8)) + # cusparseLt i8 + a_compressed = compress_to_torch_sparse_semi_structured_mat(a) # warmup - # semi_structured_sparse_dense_gemm2(a_compressed, b) - # timers.append( - # bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4_v2", - # semi_structured_sparse_dense_gemm2, a_compressed, b)) + semi_structured_sparse_dense_gemm(a_compressed, b) + timers.append( + bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4", + semi_structured_sparse_dense_gemm, a_compressed, b)) + + timers.append( + bench_fn(label, + sub_label, + "cusparseLt_i8_i8_2_4_noncached", + semi_structured_sparse_dense_gemm, + a_compressed, + b, + cached=False)) if use_fp8: a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) @@ -114,27 +123,25 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul-w-scales", dense_matmul, a, b, torch.float8_e4m3fn)) - a_compressed = compress_to_torch_sparse_semi_structured_mat(a) # cusparseLt fp8 - # timers.append( - # bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", - # semi_structured_sparse_dense_gemm, - # a_compressed, b)) + a_compressed = compress_to_torch_sparse_semi_structured_mat(a) # warmup - semi_structured_sparse_dense_gemm2(a_compressed, b) + semi_structured_sparse_dense_gemm(a_compressed, b) + timers.append( - bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_v2", - semi_structured_sparse_dense_gemm2, a_compressed, b)) - - # handle = semi_structured_fp8_prepare_mm(a_compressed.packed, b) - # id = int(handle) - # scale = torch.tensor(1.0, device='cuda', dtype=torch.float32) - # # scale = None - # timers.append( - # bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_prepared", - # semi_structured_fp8_mm_prepared, id, scale=scale)) - # semi_structured_fp8_destroy(id) + bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", + semi_structured_sparse_dense_gemm, a_compressed, b)) + + timers.append( + bench_fn(label, + sub_label, + "cusparseLt_fp8_fp8_2_4_noncached", + semi_structured_sparse_dense_gemm, + a_compressed, + b, + cached=False)) + return timers @@ -168,11 +175,6 @@ def make_output(data: Iterable[TMeasurement], print(f"== All Results {base_description} ====") print_timers(data) - # pickle all the results - # timestamp = int(time.time()) if timestamp is None else timestamp - # with open(f"{base_description}-{timestamp}.pkl", "wb") as f: - # pkl.dump(data, f) - def run_model_bench(args): if not is_semi_structured_supported(): @@ -211,15 +213,6 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: print(f"== Results cuSparseLt {model}-TP{tp_size} ====") print_timers(data) - # timestamp = 
int(time.time()) - - # all_data = [] - # for d in model_bench_data: - # all_data.extend(d) - # pickle all data - # with open(f"model_bench-{timestamp}.pkl", "wb") as f: - # pkl.dump(all_data, f) - if __name__ == '__main__': diff --git a/csrc/ops.h b/csrc/ops.h index cc18915d37762..caac8c60e279c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -226,7 +226,7 @@ void register_graph_buffers(fptr_t _fa, #ifndef USE_ROCM torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input); -torch::Tensor cslt_mm_fp8_semi_structured( +torch::Tensor cslt_mm_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, const c10::optional& scale_opt, const c10::optional& bias_opt); @@ -236,4 +236,6 @@ torch::Tensor cslt_mm_fp8_semi_structured2( const c10::optional& scale_opt, const c10::optional& bias_opt); +void cslt_clear_cache(); + #endif diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index eafc4ac72736c..54fbc81345449 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -3,33 +3,25 @@ #include #include -#define STUB_FUNC_IMPL() \ - torch::Tensor cslt_compress_fp8_semi_structured( \ - const torch::Tensor& input) { \ - torch::Tensor cslt_compress_fp8_semi_structured( \ - const torch::Tensor& input) { \ - TORCH_CHECK(false, \ - "cusparseLt is not found or " \ - "unsupported dtype for compressed matrix in current " \ - "version of cuSPARSELt."); \ - } \ - \ - torch::Tensor cslt_mm_fp8_semi_structured( \ - const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ - const c10::optional& scale_opt, \ - const c10::optional& bias_opt) { \ - TORCH_CHECK(false, \ - "Unsupported dtype for compressed matrix multiplication in " \ - "current version of cuSPARSELt."); \ - } \ - torch::Tensor cslt_mm_fp8_semi_structured2( \ - const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ - const c10::optional& scale_opt, \ - const c10::optional& bias_opt) { \ - TORCH_CHECK(false, \ - "Unsupported dtype for compressed matrix multiplication in " \ - "current version of cuSPARSELt."); \ - } +#define STUB_FUNC_IMPL() \ + torch::Tensor cslt_compress_fp8_semi_structured( \ + const torch::Tensor& input) { \ + TORCH_CHECK(false, "cusparseLt is not found"); \ + } \ + \ + torch::Tensor cslt_mm_semi_structured( \ + const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ + const c10::optional& scale_opt, \ + const c10::optional& bias_opt) { \ + TORCH_CHECK(false, "cusparseLt is not found"); \ + } \ + torch::Tensor cslt_mm_fp8_semi_structured2( \ + const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ + const c10::optional& scale_opt, \ + const c10::optional& bias_opt) { \ + TORCH_CHECK(false, "cusparseLt is not found"); \ + } \ + void cslt_clear_cache() { TORCH_CHECK(false, "cusparseLt is not found"); } #if defined(VLLM_CUSPARSELT_ENABLED) @@ -206,14 +198,10 @@ void prepare_mm_semi_structured(const cacheID& tuple_id, &handle, entry.alg_sel_p, CUSPARSELT_MATMUL_SEARCH_ITERATIONS, &num_search_iters, sizeof(num_search_iters))); - // TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanInit( - // &handle, &plan, &entry.matmul, &alg_sel)); TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanInit( &handle, entry.plan_p, entry.matmul_p, entry.alg_sel_p)); size_t workspace_size; - // TORCH_CUDASPARSE_CHECK( - // cusparseLtMatmulGetWorkspace(&handle, &global_plan, &workspace_size)); TORCH_CUDASPARSE_CHECK( cusparseLtMatmulGetWorkspace(&handle, 
entry.plan_p, &workspace_size)); AT_CUDA_CHECK(cudaMalloc((void**)&entry.workspace_ptr, workspace_size)); @@ -279,7 +267,7 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { return compressed_tensor; } -torch::Tensor cslt_mm_fp8_semi_structured( +torch::Tensor cslt_mm_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, const c10::optional& alpha_opt, const c10::optional& bias_opt) { @@ -342,8 +330,6 @@ torch::Tensor cslt_mm_fp8_semi_structured2( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, const c10::optional& alpha_opt, const c10::optional& bias_opt) { - TORCH_CHECK(compressed_A.scalar_type() == at::ScalarType::Float8_e4m3fn, - "Only float8 e4m3 is supported in vllm:cslt_compress"); namespace vc = vllm::cusparseLt; if (!vc::handle_initialized) { TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vc::handle)); @@ -355,40 +341,54 @@ torch::Tensor cslt_mm_fp8_semi_structured2( cusparseLtMatmulPlan_t plan; cusparseLtMatmulAlgSelection_t alg_sel; - cudaDataType input_type = CUDA_R_8F_E4M3; + cudaDataType input_type; cudaDataType output_type; cudaDataType C_type; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; + cusparseComputeType compute_type; auto compression_factor = 9; - auto out_dtype = dense_B.scalar_type(); - - switch (out_dtype) { - case at::ScalarType::Float8_e4m3fn: - output_type = CUDA_R_8F_E4M3; - C_type = CUDA_R_16F; + switch (compressed_A.scalar_type()) { + case at::ScalarType::Char: + input_type = CUDA_R_8I; + output_type = CUDA_R_8I; + C_type = CUDA_R_8I; + compute_type = CUSPARSE_COMPUTE_32I; + compression_factor = 10; break; case at::ScalarType::Half: + input_type = CUDA_R_16F; output_type = CUDA_R_16F; C_type = CUDA_R_16F; + compute_type = CUSPARSE_COMPUTE_32F; break; case at::ScalarType::BFloat16: + input_type = CUDA_R_16BF; output_type = CUDA_R_16BF; C_type = CUDA_R_16BF; + compute_type = CUSPARSE_COMPUTE_32F; break; case at::ScalarType::Float: + input_type = CUDA_R_32F; output_type = CUDA_R_32F; C_type = CUDA_R_32F; + compute_type = CUSPARSE_COMPUTE_32F; + break; + case at::ScalarType::Float8_e4m3fn: + input_type = CUDA_R_8F_E4M3; + output_type = CUDA_R_8F_E4M3; + C_type = CUDA_R_16F; + compute_type = CUSPARSE_COMPUTE_32F; break; default: - TORCH_CHECK(false, - "Unsupported out_dtype passed, must be one of {fp16, bf16, " - "float32} for fp8 inputs"); + TORCH_CHECK( + false, + "Unsupported dtype for cuSPARSELt compressed matrix multiplication."); break; } int64_t k = dense_B.size(0); int64_t n = dense_B.size(1); int64_t m = (compressed_A.numel() * 16 / compression_factor) / k; + auto out_dtype = dense_B.scalar_type(); // initialize sparse descriptor cusparseLtMatDescriptor_t sparse_input_descriptor; @@ -467,6 +467,9 @@ torch::Tensor cslt_mm_fp8_semi_structured2( TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); return res; } + +void cslt_clear_cache() { vllm::cusparseLt::cusparseLt_cache.clear(); } + #else STUB_FUNC_IMPL() diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 7e356954f8d04..99d97547b85b2 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -328,10 +328,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { &cslt_compress_fp8_semi_structured); ops.def( - "cslt_mm_fp8_semi_structured(Tensor! compressed_A, Tensor! denseB," + "cslt_mm_semi_structured(Tensor! compressed_A, Tensor! denseB," "float!? scale, Tensor!? 
bias) -> Tensor"); - ops.impl("cslt_mm_fp8_semi_structured", torch::kCUDA, - &cslt_mm_fp8_semi_structured); + ops.impl("cslt_mm_semi_structured", torch::kCUDA, &cslt_mm_semi_structured); ops.def( "cslt_mm_fp8_semi_structured2(Tensor! compressed_A, Tensor! denseB," @@ -339,6 +338,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("cslt_mm_fp8_semi_structured2", torch::kCUDA, &cslt_mm_fp8_semi_structured2); + ops.def("cslt_clear_cache() -> ()"); + ops.impl("cslt_clear_cache", &cslt_clear_cache); + #endif // Quantized GEMM for GPTQ. diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index c67bee2d74b22..e107630979250 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -7,8 +7,10 @@ decompress_torch_sparse_semi_structured_mat, dense_matmul, generate_pruned_semi_structured_mat, get_random_mat, is_semi_structured_supported, semi_structured_dense_sparse_T_gemm, + semi_structured_dense_sparse_T_gemm_scaled, semi_structured_sparse_dense_gemm, - semi_structured_sparse_dense_gemm_scaled, semi_structured_sparse_dense_gemm2) + semi_structured_sparse_dense_gemm_scaled, + clear_cache) DTYPES = [torch.float16, torch.bfloat16, torch.int8] SIZES = [(128, 128), (1024, 8192)] @@ -70,18 +72,22 @@ def test_torch_semi_structured_sparse_dense_matmul(mnk, dtype): A = compress_to_torch_sparse_semi_structured_mat(A_pruned) B = get_random_mat(K, N, dtype) if dtype is torch.int8: - with pytest.raises(ValueError) as e: + with pytest.raises(ValueError): C_sparse = semi_structured_sparse_dense_gemm(A, B) else: C_sparse = semi_structured_sparse_dense_gemm(A, B) C = dense_matmul(A_pruned, B, dtype) torch.testing.assert_close(C, C_sparse) - # Verify cache - B = get_random_mat(K, N, dtype) - C = dense_matmul(A_pruned, B, dtype) - C_sparse = semi_structured_sparse_dense_gemm(A, B) - torch.testing.assert_close(C, C_sparse) + # Verify cache + B = get_random_mat(K, N, dtype) + C = dense_matmul(A_pruned, B, dtype) + C_sparse = semi_structured_sparse_dense_gemm(A, B) + torch.testing.assert_close(C, C_sparse) + + C_sparse = semi_structured_sparse_dense_gemm(A, B, cached=False) + torch.testing.assert_close(C, C_sparse) + clear_cache() @pytest.mark.skipif( @@ -105,13 +111,17 @@ def test_torch_semi_structured_sparse_dense_T_matmul(mnk, dtype): C_sparse = semi_structured_sparse_dense_gemm(A, B.t()) torch.testing.assert_close(C, C_sparse) + C_sparse = semi_structured_sparse_dense_gemm(A, B.t(), cached=False) + torch.testing.assert_close(C, C_sparse) + clear_cache() + # TODO modelopt config has to be replaced with corresponding fp8_24 config @pytest.mark.skipif( not is_semi_structured_supported() or not is_quant_method_supported("modelopt"), reason="Semi structured fp8 matmul is not supported on this GPU type.") -def test_torch_semi_structured_sparse_dense_T_fp8_matmul2(): +def test_torch_semi_structured_sparse_dense_T_fp8_matmul(): M, N, K = (32, 64, 32) dtype = torch.float8_e4m3fn A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=dtype) @@ -119,9 +129,21 @@ def test_torch_semi_structured_sparse_dense_T_fp8_matmul2(): B = torch.full((K, N), .25, device='cuda', dtype=dtype).t() C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) - C_sparse = semi_structured_sparse_dense_gemm2(A, B).to(torch.float32) + C_sparse = semi_structured_sparse_dense_gemm(A, B).to(torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) + + # Cached version + B = torch.full((K, N), .25, device='cuda', dtype=dtype).t() + C = 
dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) + C_sparse = semi_structured_sparse_dense_gemm(A, B).to(torch.float32) torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) + # Noncached version + C_sparse = semi_structured_sparse_dense_gemm(A, B, cached=False).to( + torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) + clear_cache() + @pytest.mark.skipif( not is_semi_structured_supported(), @@ -138,6 +160,11 @@ def test_torch_semi_structured_dense_sparse_T_matmul(mnk, dtype): C = dense_matmul(A, B_T_pruned.t(), dtype) torch.testing.assert_close(C, C_sparse) + C_sparse = semi_structured_dense_sparse_T_gemm(A, B_T, cached=False) + C = dense_matmul(A, B_T_pruned.t(), dtype) + torch.testing.assert_close(C, C_sparse) + clear_cache() + # TODO modelopt config has to be replaced with corresponding fp8_24 config @pytest.mark.skipif( @@ -154,3 +181,113 @@ def test_torch_semi_structured_dense_sparse_T_fp8_matmul(): C_sparse = semi_structured_dense_sparse_T_gemm(A, B_T).to(torch.float32) C = dense_matmul(A, B_T_pruned.t(), dtype=dtype).to(torch.float32) torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) + + C_sparse = semi_structured_dense_sparse_T_gemm(A, B_T).to(torch.float32) + C = dense_matmul(A, B_T_pruned.t(), dtype=dtype).to(torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) + clear_cache() + + +@pytest.mark.skipif( + not is_semi_structured_supported() + or not is_quant_method_supported("modelopt"), + reason="Semi structured fp8 matmul is not supported on this GPU type.") +def test_torch_semi_structured_sparse_dense_T_fp8_scaled_matmul(): + M, N, K = (32, 64, 32) + A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=torch.float16) + A_pruned_fp8, scale_A = to_float8(A_pruned) + B = torch.rand((K, N), device='cuda').to(torch.float16).t() + B_fp8, scale_B = to_float8(B) + + A_fp8_sparse = compress_to_torch_sparse_semi_structured_mat(A_pruned_fp8) + + C = torch._scaled_mm(A_pruned_fp8, + B_fp8, + scale_a=scale_A, + scale_b=scale_B, + out_dtype=torch.float32) + C_sparse = semi_structured_sparse_dense_gemm_scaled(A_fp8_sparse, + B_fp8, + scale_a=scale_A, + scale_b=scale_B).to( + torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=7e-2, atol=7e-2) + + # cached + B = torch.rand((K, N), device='cuda').to(torch.float16).t() + B_fp8, scale_B = to_float8(B) + + C = torch._scaled_mm(A_pruned_fp8, + B_fp8, + scale_a=scale_A, + scale_b=scale_B, + out_dtype=torch.float32) + C_sparse = semi_structured_sparse_dense_gemm_scaled(A_fp8_sparse, + B_fp8, + scale_a=scale_A, + scale_b=scale_B).to( + torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=7e-2, atol=7e-2) + + # noncached + C_sparse = semi_structured_sparse_dense_gemm_scaled(A_fp8_sparse, + B_fp8, + scale_a=scale_A, + scale_b=scale_B, + cached=False).to( + torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=7e-2, atol=7e-2) + clear_cache() + + +@pytest.mark.skipif( + not is_semi_structured_supported() + or not is_quant_method_supported("modelopt"), + reason="Semi structured fp8 matmul is not supported on this GPU type.") +def test_torch_semi_structured_dense_sparse_T_fp8_scaled_matmul(): + M, N, K = (32, 64, 32) + A = torch.rand((M, K), device='cuda', dtype=torch.float16) + A_fp8, scale_a = to_float8(A) + B_T_pruned = generate_pruned_semi_structured_mat(N, K, dtype=torch.float16) + B_T_pruned_fp8, scale_b = to_float8(B_T_pruned) + B_T_packed = compress_to_torch_sparse_semi_structured_mat(B_T_pruned_fp8) + + C_sparse = 
semi_structured_dense_sparse_T_gemm_scaled(A_fp8, + B_T_packed, + scale_a=scale_a, + scale_b=scale_b).to( + torch.float32) + C = torch._scaled_mm(B_T_pruned_fp8, + A_fp8.t(), + scale_a=scale_b, + scale_b=scale_a, + out_dtype=torch.float32).t() + torch.testing.assert_close(C, C_sparse, rtol=7e-2, atol=7e-2) + clear_cache() + + +@pytest.mark.skipif( + not is_semi_structured_supported(), + reason="Semi structured matmul is not supported on this GPU type.") +def test_torch_semi_structured_sparse_dense_t_int8_scaled_matmul(): + dtype = torch.int8 + M, N, K = (32, 64, 32) + A_pruned = generate_pruned_semi_structured_mat(M, K, dtype) + A = compress_to_torch_sparse_semi_structured_mat(A_pruned) + B = get_random_mat(N, K, dtype) + + scale_a = torch.tensor(2.0, dtype=torch.float32, device='cuda') + scale_b = torch.tensor(2.0, dtype=torch.float32, device='cuda') + + C = dense_matmul(A_pruned, + B.t(), + dtype=dtype, + scale_a=scale_a, + scale_b=scale_b).to(torch.float32) + C_sparse = semi_structured_sparse_dense_gemm_scaled(A, + B.t(), + scale_a=scale_a, + scale_b=scale_b).to( + torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) + clear_cache() diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index b7260c1a18da0..d5c04d94e6d50 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -713,13 +713,12 @@ def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: return torch.ops._C.cslt_compress_fp8_semi_structured(input) -def semi_structured_fp8_mm( - A_compressed: torch.Tensor, - B_dense: torch.Tensor, - scale: Optional[torch.Tensor] = None, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return torch.ops._C.cslt_mm_fp8_semi_structured(A_compressed, B_dense, - scale, bias) +def semi_structured_mm(A_compressed: torch.Tensor, + B_dense: torch.Tensor, + scale: Optional[float] = None, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return torch.ops._C.cslt_mm_semi_structured(A_compressed, B_dense, scale, + bias) def semi_structured_fp8_mm2( @@ -727,10 +726,12 @@ def semi_structured_fp8_mm2( B_dense: torch.Tensor, scale: Optional[float] = None, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - assert A_compressed.dtype == torch.float8_e4m3fn return torch.ops._C.cslt_mm_fp8_semi_structured2(A_compressed, B_dense, scale, bias) +def semi_structured_clear_cache() -> None: + return torch.ops._C.cslt_clear_cache() + # int8 def scaled_int8_quant( diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 23cfd825e3448..f98631ab04a49 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -3,17 +3,14 @@ from torch.sparse import (SparseSemiStructuredTensor, SparseSemiStructuredTensorCUSPARSELT, to_sparse_semi_structured) -from torch.sparse import (SparseSemiStructuredTensor, - SparseSemiStructuredTensorCUSPARSELT, - to_sparse_semi_structured) -from vllm import _custom_ops as ops -from vllm._custom_ops import (semi_structured_fp8_compress, - semi_structured_fp8_mm, semi_structured_fp8_mm2) +from vllm._custom_ops import (cutlass_scaled_mm, semi_structured_fp8_compress, + semi_structured_fp8_mm2, semi_structured_mm, semi_structured_clear_cache) from vllm.platforms import current_platform SparseSemiStructuredTensor._FORCE_CUTLASS = False + def compress_to_torch_sparse_semi_structured_mat(pruned_tensor: torch.Tensor): ''' Compresses original pruned (with zeros) 
tensor into packed version @@ -21,8 +18,8 @@ def compress_to_torch_sparse_semi_structured_mat(pruned_tensor: torch.Tensor): pruned_tensor(torch.Tensor) - pruned but not packed tensor Returns: torch.SparseSemiStructuredTensorCUSPARSELT: torch wrapped cusparseLt-packed tensor. - ''' - + ''' # noqa: E501 + if pruned_tensor.dtype == torch.float8_e4m3fn: packed = semi_structured_fp8_compress(pruned_tensor) return SparseSemiStructuredTensorCUSPARSELT( @@ -48,13 +45,13 @@ def decompress_torch_sparse_semi_structured_mat(packed_tensor: torch.Tensor): packed_tensor - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat. Returns: pruned (torch.Tensor) - pruned torch.tensor - ''' + ''' # noqa: E501 if packed_tensor.dtype == torch.float8_e4m3fn: - return semi_structured_fp8_mm(packed_tensor.packed, - torch.eye(packed_tensor.shape[-1], - dtype=packed_tensor.dtype, - device=packed_tensor.device).t(), - transpose_result=False) + return semi_structured_mm( + packed_tensor.packed, + torch.eye(packed_tensor.shape[-1], + dtype=packed_tensor.dtype, + device=packed_tensor.device).t()) else: # Fix of to_dense() function supporting int8 # cuSparseLT for int8 requires dense matrix to be non-contiguous @@ -67,60 +64,68 @@ def decompress_torch_sparse_semi_structured_mat(packed_tensor: torch.Tensor): def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, b_dense: torch.Tensor, - bias: torch.Tensor = None): + bias: torch.Tensor = None, + cached: bool = True): ''' Performs matrix multiplication (A @ B) of semi-structured sparse (A) and dense (B) matrices. In case of int8 and fp8 types, dense matrix B has to be non-contiguous. Args: a_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat. b_dense (torch.Tensor) - dense matrix tensor. - bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. + bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. + cached (bool) - whether to use cached (faster) version of cusparseLt wrapper. + Result: torch.Tensor - Result of matrix multiplication. 
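# Editorial usage sketch (illustration only, not part of the diff): the `cached` flag
# documented above switches between the plan-caching cusparseLt path and the one-shot
# fallback; for supported dtypes both are expected to return the same result. Helper
# names are the ones defined in this file; shapes are illustrative.
A_pruned_ex = generate_pruned_semi_structured_mat(32, 128, dtype=torch.float16)
A_packed_ex = compress_to_torch_sparse_semi_structured_mat(A_pruned_ex)
B_ex = get_random_mat(128, 64, torch.float16)
C_cached_ex = semi_structured_sparse_dense_gemm(A_packed_ex, B_ex)                # cached plan, keyed on (m, k, n, dtype)
C_oneoff_ex = semi_structured_sparse_dense_gemm(A_packed_ex, B_ex, cached=False)  # bypasses the C++ plan cache
clear_cache()  # drops all cached cusparseLt plans and workspaces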
- ''' + ''' # noqa: E501 assert a_packed.dtype in [ torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn - ], f"Semi structured sparse-dense matmul does not support {a_sparse.dtype}" - scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) - return semi_structured_fp8_mm(a_sparse.packed, b_dense, scale=scale) - - # if a_sparse.dtype == torch.float8_e4m3fn: - # scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) - # return semi_structured_fp8_mm(a_sparse.packed, b_dense, scale=scale) - # else: - # return torch.mm(a_sparse, b_dense) - - -def semi_structured_sparse_dense_gemm2(a_sparse: torch.Tensor, - b_dense: torch.Tensor): - assert a_sparse.dtype in [ - torch.float8_e4m3fn - ], f"Semi structured sparse-dense matmul does not support {a_sparse.dtype}" - scale = 1.0 - return semi_structured_fp8_mm2(a_sparse.packed, b_dense, scale=scale) + ], f"Semi structured sparse-dense matmul does not support {a_packed.dtype}" + if b_dense.is_contiguous() and a_packed.dtype in [ + torch.int8, torch.float8_e4m3fn + ]: + raise ValueError("cuSparseLt does not support" + "contiguous dense matrix for int8 and fp8 types") + + if cached: + return semi_structured_mm(a_packed.packed, b_dense, bias=bias) + else: + if a_packed.dtype == torch.float8_e4m3fn: + return semi_structured_fp8_mm2(a_packed.packed, b_dense, bias=bias) + else: + result = torch.mm(a_packed, b_dense) + if bias is not None: + result = torch.add(result, bias) + return result def semi_structured_dense_sparse_T_gemm(a_dense: torch.Tensor, b_T_packed: torch.Tensor, - bias: torch.Tensor = None): + bias: torch.Tensor = None, + cached: bool = True): ''' Performs matrix multiplication (a @ b_T) of transposed semi-structured sparse and dense matrices Args: a_dense (torch.Tensor) - dense matrix tensor. b_T_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. + cached (bool) - whether to use cached (faster) version of cusparseLt wrapper. Returns: torch.Tensor - Result of matrix multiplication. - ''' - return (semi_structured_sparse_dense_gemm(b_T_packed, a_dense.t(), bias)).t() + ''' # noqa: E501 + return (semi_structured_sparse_dense_gemm(b_T_packed, + a_dense.t(), + bias=bias, + cached=cached)).t() def semi_structured_sparse_dense_gemm_scaled(a_packed: torch.Tensor, b_dense: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, - bias: torch.Tensor = None): + bias: torch.Tensor = None, + cached: bool = False): ''' Performs scaled matrix multiplication (a @ b) of transposed semi-structured sparse and dense fp8 matrices Args: @@ -129,31 +134,63 @@ def semi_structured_sparse_dense_gemm_scaled(a_packed: torch.Tensor, scale_a (torch.Tensor) - scaling factor for sparse matrix, must be in float32. scale_b (torch.Tensor) - scaling factor for dense matrix, must be in float32. bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. + cached (bool) - whether to use cached (faster) version of cusparseLt wrapper. Returns: torch.Tensor - Result of matrix multiplication. 
- ''' + ''' # noqa: E501 - assert (a_packed.dtype == torch.float8_e4m3fn - and b_dense.dtype == torch.float8_e4m3fn) - assert not b_dense.is_contiguous( - ), "cusparseLt requires dense matrix be non-contiguous" # cusparseLt requires alpha to be float assert scale_a.dtype == torch.float32 and scale_b.dtype == torch.float32 - return semi_structured_fp8_mm(a_packed.packed, + scale = (scale_a * scale_b).item() + if cached: + return semi_structured_mm(a_packed.packed, b_dense, - alpha=scale_a * scale_b, - bias=bias, - transpose_result=False) + scale=scale, + bias=bias) + else: + return semi_structured_fp8_mm2(a_packed.packed, + b_dense, + bias=bias, + scale=scale) + + +def semi_structured_dense_sparse_T_gemm_scaled(a_dense: torch.Tensor, + b_T_packed: torch.Tensor, + scale_a: torch.Tensor = None, + scale_b: torch.Tensor = None, + bias: torch.Tensor = None, + cached: bool = True): + ''' + Performs matrix multiplication (a @ b_T) of transposed semi-structured sparse and dense matrices + Args: + a_dense (torch.Tensor) - dense matrix tensor. + b_T_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat + bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. + cached (bool) - whether to use cached(faster) version of cusparseLt wrapper. + + Returns: + torch.Tensor - Result of matrix multiplication. + ''' # noqa: E501 + return (semi_structured_sparse_dense_gemm_scaled(b_T_packed, + a_dense.t(), + scale_a=scale_b, + scale_b=scale_a, + bias=bias, + cached=cached)).t() +def clear_cache(): + semi_structured_clear_cache() # test utils -def dense_matmul(A, B, dtype): +def dense_matmul(A, B, dtype, scale_a=None, scale_b=None): if dtype in [torch.int8, torch.float8_e4m3fn]: - scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) - scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - return ops.cutlass_scaled_mm(A, B, scale_a, scale_b, - torch.bfloat16).to(dtype) + if scale_a is None: + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + if scale_b is None: + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + return cutlass_scaled_mm(A, B, scale_a, scale_b, + torch.bfloat16).to(dtype) else: return A @ B From 3a2c258f18f0827f7133ab49651f6aab45b25808 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 12 Nov 2024 10:25:45 +0000 Subject: [PATCH 36/39] Add output_dtype option, fix non-padded inputs case --- .../cusparseLt_benchmarks/benchmark_24.py | 106 ++++---- csrc/ops.h | 12 +- .../fp8_semi_structured/cusparseLt.cpp | 229 +++++++++++++----- csrc/torch_bindings.cpp | 9 +- tests/kernels/test_semi_structured.py | 83 ++++--- vllm/_custom_ops.py | 22 +- .../sparsity/utils/cusparse_2_4_utils.py | 157 +++++++++--- 7 files changed, 437 insertions(+), 181 deletions(-) diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index b66ef0fa7b29d..594db20cb5fb5 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -10,7 +10,8 @@ from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( compress_to_torch_sparse_semi_structured_mat, dense_matmul, get_random_mat, - is_semi_structured_supported, semi_structured_sparse_dense_gemm) + is_semi_structured_supported, semi_structured_sparse_dense_gemm, + semi_structured_sparse_dense_gemm_scaled) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -51,57 +52,64 @@ def 
bench(m: int, k: int, n: int, label: str, sub_label: str, timers = [] # pytorch float16 - timers.append( - bench_fn(label, sub_label, "pytorch_fp16_fp16_matmul", torch.mm, - a.to(dtype=torch.float16), b.to(dtype=torch.float16))) - - # pytorch bf16 - timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_matmul", torch.mm, - a.to(dtype=torch.bfloat16, device="cuda"), - b.to(dtype=torch.bfloat16, device="cuda"))) - - # cusparseLt fp16 - timers.append( - bench_fn(label, sub_label, "cusparseLt_fp16_fp16_2_4", - semi_structured_sparse_dense_gemm, - compress_to_torch_sparse_semi_structured_mat(a), b)) - - timers.append( - bench_fn(label, - sub_label, - "cusparseLt_fp16_fp16_2_4_noncached", - semi_structured_sparse_dense_gemm, - compress_to_torch_sparse_semi_structured_mat(a), - b, - cached=False)) - - # cusparseLt bf16 - a, b = make_rand_tensors(torch.bfloat16, m, n, k) - a_compressed = compress_to_torch_sparse_semi_structured_mat(a.to(dtype=torch.bfloat16)) - - timers.append( - bench_fn(label, sub_label, "cusparseLt_bf16_bf16_2_4", - semi_structured_sparse_dense_gemm, a_compressed, b)) - - timers.append( - bench_fn(label, - sub_label, - "cusparseLt_bf16_bf16_2_4_noncached", - semi_structured_sparse_dense_gemm, - a_compressed, - b, - cached=False)) + # timers.append( + # bench_fn(label, sub_label, "pytorch_fp16_fp16_matmul", torch.mm, + # a.to(dtype=torch.float16), b.to(dtype=torch.float16))) + + # # pytorch bf16 + # timers.append( + # bench_fn(label, sub_label, "pytorch_bf16_bf16_matmul", torch.mm, + # a.to(dtype=torch.bfloat16, device="cuda"), + # b.to(dtype=torch.bfloat16, device="cuda"))) + + # # cusparseLt fp16 + # timers.append( + # bench_fn(label, sub_label, "cusparseLt_fp16_fp16_2_4", + # semi_structured_sparse_dense_gemm, + # compress_to_torch_sparse_semi_structured_mat(a), b)) + + # timers.append( + # bench_fn(label, + # sub_label, + # "cusparseLt_fp16_fp16_2_4_noncached", + # semi_structured_sparse_dense_gemm, + # compress_to_torch_sparse_semi_structured_mat(a), + # b, + # cached=False)) + + # # cusparseLt bf16 + # a, b = make_rand_tensors(torch.bfloat16, m, n, k) + # a_compressed = compress_to_torch_sparse_semi_structured_mat( + # a.to(dtype=torch.bfloat16)) + + # timers.append( + # bench_fn(label, sub_label, "cusparseLt_bf16_bf16_2_4", + # semi_structured_sparse_dense_gemm, a_compressed, b)) + + # timers.append( + # bench_fn(label, + # sub_label, + # "cusparseLt_bf16_bf16_2_4_noncached", + # semi_structured_sparse_dense_gemm, + # a_compressed, + # b, + # cached=False)) a, b = make_rand_tensors(torch.int8, m, n, k) - # cutlass i8 - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_matmul", dense_matmul, a, b, - torch.int8)) + # # cutlass i8 + # timers.append( + # bench_fn(label, sub_label, "cutlass_i8_i8_matmul", dense_matmul, a, b, + # torch.int8)) # cusparseLt i8 a_compressed = compress_to_torch_sparse_semi_structured_mat(a) # warmup + scale = torch.tensor(1.0, dtype=torch.float32, device='cuda') + semi_structured_sparse_dense_gemm_scaled(a_compressed, + b, + scale_a=scale, + scale_b=scale) + semi_structured_sparse_dense_gemm(a_compressed, b) timers.append( bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4", @@ -133,6 +141,12 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4", semi_structured_sparse_dense_gemm, a_compressed, b)) + semi_structured_sparse_dense_gemm_scaled(a_compressed, b, scale, scale) + timers.append( + bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_scale", + 
semi_structured_sparse_dense_gemm_scaled, a_compressed, b, + scale, scale)) + timers.append( bench_fn(label, sub_label, diff --git a/csrc/ops.h b/csrc/ops.h index caac8c60e279c..3e9f9c0981978 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -228,13 +228,15 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input); torch::Tensor cslt_mm_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, - const c10::optional& scale_opt, - const c10::optional& bias_opt); + const c10::optional& scale_opt, + const c10::optional& bias_opt, + const std::optional out_dtype_opt); -torch::Tensor cslt_mm_fp8_semi_structured2( +torch::Tensor cslt_mm_semi_structured2( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, - const c10::optional& scale_opt, - const c10::optional& bias_opt); + const c10::optional& scale_opt, + const c10::optional& bias_opt, + const std::optional out_dtype_opt); void cslt_clear_cache(); diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index 54fbc81345449..d01b07d14d387 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -12,13 +12,15 @@ torch::Tensor cslt_mm_semi_structured( \ const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ const c10::optional& scale_opt, \ - const c10::optional& bias_opt) { \ + const c10::optional& bias_opt, \ + const std::optional out_dtype_opt) { \ TORCH_CHECK(false, "cusparseLt is not found"); \ } \ - torch::Tensor cslt_mm_fp8_semi_structured2( \ + torch::Tensor cslt_mm_semi_structured2( \ const torch::Tensor& compressed_A, const torch::Tensor& dense_B, \ const c10::optional& scale_opt, \ - const c10::optional& bias_opt) { \ + const c10::optional& bias_opt, \ + const std::optional out_dtype_opt) { \ TORCH_CHECK(false, "cusparseLt is not found"); \ } \ void cslt_clear_cache() { TORCH_CHECK(false, "cusparseLt is not found"); } @@ -75,17 +77,22 @@ struct cusparseLtEntry { cusparseLtHandle_t handle; bool handle_initialized = false; -using cacheID = std::tuple; + +// m, k, n, input_type, output_type, bias enabled, scale enabled, is B +// contiguous +using cacheID = std::tuple; std::map cusparseLt_cache; -void prepare_mm_semi_structured(const cacheID& tuple_id, - at::ScalarType out_dtype, - bool is_B_contiguous) { +void prepare_mm_semi_structured(const cacheID& tuple_id) { auto m = std::get<0>(tuple_id); auto k = std::get<1>(tuple_id); auto n = std::get<2>(tuple_id); - at::ScalarType input_dtype = std::get<3>(tuple_id); + auto input_dtype = std::get<3>(tuple_id); + auto out_dtype = std::get<4>(tuple_id); + bool is_B_contiguous = std::get<7>(tuple_id); + auto& entry = cusparseLt_cache[tuple_id]; cudaDataType input_type; @@ -131,33 +138,56 @@ void prepare_mm_semi_structured(const cacheID& tuple_id, break; } - // cudaDataType input_type = CUDA_R_8F_E4M3; - // cudaDataType output_type; - // cudaDataType C_type; - // cusparseComputeType compute_type = CUSPARSE_COMPUTE_32F; - // switch (out_dtype) { - // case at::ScalarType::Float8_e4m3fn: - // output_type = CUDA_R_8F_E4M3; - // C_type = CUDA_R_16F; - // break; - // case at::ScalarType::Half: - // output_type = CUDA_R_16F; - // C_type = CUDA_R_16F; - // break; - // case at::ScalarType::BFloat16: - // output_type = CUDA_R_16BF; - // C_type = CUDA_R_16BF; - // break; - // case at::ScalarType::Float: - // output_type = CUDA_R_32F; - // C_type = CUDA_R_32F; - // break; - // default: - // TORCH_CHECK(false, - // 
"Unsupported out_dtype passed, must be one of {fp16, bf16, - // " "float32} for fp8 inputs"); - // break; - // } + if (input_type == CUDA_R_8I) { + switch (out_dtype) { + case at::ScalarType::Char: + output_type = CUDA_R_8I; + C_type = CUDA_R_8I; + break; + case at::ScalarType::Half: + C_type = CUDA_R_16F; + output_type = CUDA_R_16F; + break; + case at::ScalarType::BFloat16: + C_type = CUDA_R_16BF; + output_type = CUDA_R_16BF; + break; + case at::ScalarType::Int: + C_type = CUDA_R_32I; + output_type = CUDA_R_32I; + break; + default: + TORCH_CHECK(false, + "Unsupported out_dtype passed, must be one of {fp16, bf16, " + "int32} for int8 inputs"); + break; + } + } else if (input_type == CUDA_R_8F_E4M3) { + switch (out_dtype) { + case at::ScalarType::Float8_e4m3fn: + output_type = CUDA_R_8F_E4M3; + C_type = CUDA_R_16F; + break; + case at::ScalarType::Half: + output_type = CUDA_R_16F; + C_type = CUDA_R_16F; + break; + case at::ScalarType::BFloat16: + output_type = CUDA_R_16BF; + C_type = CUDA_R_16BF; + break; + case at::ScalarType::Float: + output_type = CUDA_R_32F; + C_type = CUDA_R_32F; + break; + default: + TORCH_CHECK(false, + "Unsupported out_dtype passed, must be one of {fp16, bf16, " + "float32} for fp8 inputs"); + break; + } + } + entry.sparse_input_descriptor_p = new cusparseLtMatDescriptor_t(); entry.dense_input_descriptor_p = new cusparseLtMatDescriptor_t(); entry.res_descriptor_p = new cusparseLtMatDescriptor_t(); @@ -269,8 +299,9 @@ torch::Tensor cslt_compress_fp8_semi_structured(const torch::Tensor& input) { torch::Tensor cslt_mm_semi_structured( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, - const c10::optional& alpha_opt, - const c10::optional& bias_opt) { + const c10::optional& scale_opt, + const c10::optional& bias_opt, + const std::optional out_dtype_opt) { namespace vc = vllm::cusparseLt; if (!vc::handle_initialized) { TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vc::handle)); @@ -278,18 +309,25 @@ torch::Tensor cslt_mm_semi_structured( } auto input_dtype = compressed_A.scalar_type(); - auto out_dtype = dense_B.scalar_type(); + if (out_dtype_opt.has_value()) { + TORCH_CHECK(dense_B.scalar_type() == at::ScalarType::Char or + dense_B.scalar_type() == at::ScalarType::Float8_e4m3fn, + "out_dtype support only available for int8/fp8 inputs") + } + auto out_dtype = + out_dtype_opt.has_value() ? *out_dtype_opt : dense_B.scalar_type(); auto compression_factor = (input_dtype == at::ScalarType::Char) ? 10 : 9; int64_t k = dense_B.size(0); int64_t n = dense_B.size(1); int64_t m = (compressed_A.numel() * 16 / compression_factor) / k; - vc::cacheID tuple_id = std::make_tuple(m, k, n, input_dtype); + vc::cacheID tuple_id = + std::make_tuple(m, k, n, input_dtype, out_dtype, bias_opt.has_value(), + scale_opt.has_value(), dense_B.is_contiguous()); bool found = vc::cusparseLt_cache.count(tuple_id); if (not found) { - vc::prepare_mm_semi_structured(tuple_id, out_dtype, - dense_B.is_contiguous()); + vc::prepare_mm_semi_structured(tuple_id); } auto& entry = vc::cusparseLt_cache[tuple_id]; @@ -302,10 +340,26 @@ torch::Tensor cslt_mm_semi_structured( sizeof(dBias))); } - // float alpha = 1.0; - float alpha = alpha_opt.has_value() ? static_cast(*alpha_opt) : 1.0; + // float scale = scale_opt.has_value() ? static_cast(*scale_opt) : 1.0; + // float beta = 0.0; + // auto scale_ptr = &scale; + + float scale = 1.0; + auto scale_ptr = &scale; float beta = 0.0; - auto alpha_ptr = α + + if (scale_opt.has_value()) { + const auto scale_tensor = scale_opt.has_value() ? 
*scale_opt : at::Tensor{}; + if (scale_tensor.numel() == 1) { + scale = scale_tensor.item(); + } else { + auto tensor_alpha_mode = 1; + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( + &vc::handle, entry.matmul_p, CUSPARSELT_MATMUL_ALPHA_VECTOR_SCALING, + &tensor_alpha_mode, sizeof(tensor_alpha_mode))); + scale_ptr = static_cast(scale_tensor.data_ptr()); + } + } auto res_tensor_options = c10::TensorOptions().dtype(out_dtype).device(dense_B.device()); @@ -314,22 +368,23 @@ torch::Tensor cslt_mm_semi_structured( if (found) { TORCH_CUDASPARSE_CHECK(cusparseLtMatmul( - &vc::handle, entry.plan_p, alpha_ptr, compressed_A.data_ptr(), + &vc::handle, entry.plan_p, scale_ptr, compressed_A.data_ptr(), dense_B.data_ptr(), &beta, res.data_ptr(), res.data_ptr(), entry.workspace_ptr, &stream, 1)); } else { TORCH_CUDASPARSE_CHECK(cusparseLtMatmulSearch( - &vc::handle, entry.plan_p, alpha_ptr, compressed_A.data_ptr(), + &vc::handle, entry.plan_p, scale_ptr, compressed_A.data_ptr(), dense_B.data_ptr(), &beta, res.data_ptr(), res.data_ptr(), entry.workspace_ptr, &stream, 1)); } return res; } -torch::Tensor cslt_mm_fp8_semi_structured2( +torch::Tensor cslt_mm_semi_structured2( const torch::Tensor& compressed_A, const torch::Tensor& dense_B, - const c10::optional& alpha_opt, - const c10::optional& bias_opt) { + const c10::optional& scale_opt, + const c10::optional& bias_opt, + const std::optional out_dtype_opt) { namespace vc = vllm::cusparseLt; if (!vc::handle_initialized) { TORCH_CUDASPARSE_CHECK(cusparseLtInit(&vc::handle)); @@ -385,10 +440,65 @@ torch::Tensor cslt_mm_fp8_semi_structured2( break; } + auto out_dtype = dense_B.scalar_type(); + if (out_dtype_opt.has_value()) { + out_dtype = out_dtype_opt.value(); + if (input_type == CUDA_R_8I) { + switch (out_dtype) { + case at::ScalarType::Char: + output_type = CUDA_R_8I; + C_type = CUDA_R_8I; + break; + case at::ScalarType::Half: + C_type = CUDA_R_16F; + output_type = CUDA_R_16F; + break; + case at::ScalarType::BFloat16: + C_type = CUDA_R_16BF; + output_type = CUDA_R_16BF; + break; + case at::ScalarType::Int: + C_type = CUDA_R_32I; + output_type = CUDA_R_32I; + break; + default: + TORCH_CHECK(false, + "Unsupported out_dtype passed, must be one of {fp16, " + "bf16, int32} for int8 inputs"); + break; + } + } else if (input_type == CUDA_R_8F_E4M3) { + switch (out_dtype) { + case at::ScalarType::Float8_e4m3fn: + output_type = CUDA_R_8F_E4M3; + C_type = CUDA_R_16F; + break; + case at::ScalarType::Half: + output_type = CUDA_R_16F; + C_type = CUDA_R_16F; + break; + case at::ScalarType::BFloat16: + output_type = CUDA_R_16BF; + C_type = CUDA_R_16BF; + break; + case at::ScalarType::Float: + output_type = CUDA_R_32F; + C_type = CUDA_R_32F; + break; + default: + TORCH_CHECK(false, + "Unsupported out_dtype passed, must be one of {fp16, " + "bf16, float32} for fp8 inputs"); + break; + } + } else { + TORCH_CHECK(false, + "out_dtype support only available for int8/fp8 inputs"); + } + } int64_t k = dense_B.size(0); int64_t n = dense_B.size(1); int64_t m = (compressed_A.numel() * 16 / compression_factor) / k; - auto out_dtype = dense_B.scalar_type(); // initialize sparse descriptor cusparseLtMatDescriptor_t sparse_input_descriptor; @@ -433,11 +543,22 @@ torch::Tensor cslt_mm_fp8_semi_structured2( sizeof(dBias))); } + float scale = 1.0; + auto scale_ptr = &scale; float beta = 0.0; - const float alpha = - alpha_opt.has_value() ? static_cast(*alpha_opt) : 1.0; - auto alpha_ptr = α + if (scale_opt.has_value()) { + const auto scale_tensor = scale_opt.has_value() ? 
*scale_opt : at::Tensor{}; + if (scale_tensor.numel() == 1) { + scale = scale_tensor.item(); + } else { + auto tensor_alpha_mode = 1; + TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute( + &vc::handle, &matmul, CUSPARSELT_MATMUL_ALPHA_VECTOR_SCALING, + &tensor_alpha_mode, sizeof(tensor_alpha_mode))); + scale_ptr = static_cast(scale_tensor.data_ptr()); + } + } TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit( &vc::handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)); TORCH_CUDASPARSE_CHECK( @@ -452,7 +573,7 @@ torch::Tensor cslt_mm_fp8_semi_structured2( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); TORCH_CUDASPARSE_CHECK( - cusparseLtMatmul(&vc::handle, &plan, alpha_ptr, compressed_A.data_ptr(), + cusparseLtMatmul(&vc::handle, &plan, scale_ptr, compressed_A.data_ptr(), dense_B.data_ptr(), &beta, res.data_ptr(), res.data_ptr(), workspace_ptr.get(), &stream, 1)); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 99d97547b85b2..908b052d21997 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -329,14 +329,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cslt_mm_semi_structured(Tensor! compressed_A, Tensor! denseB," - "float!? scale, Tensor!? bias) -> Tensor"); + "Tensor!? scale, Tensor!? bias, ScalarType!? output_dtype) -> Tensor"); ops.impl("cslt_mm_semi_structured", torch::kCUDA, &cslt_mm_semi_structured); ops.def( - "cslt_mm_fp8_semi_structured2(Tensor! compressed_A, Tensor! denseB," - "float!? scale, Tensor!? bias) -> Tensor"); - ops.impl("cslt_mm_fp8_semi_structured2", torch::kCUDA, - &cslt_mm_fp8_semi_structured2); + "cslt_mm_semi_structured2(Tensor! compressed_A, Tensor! denseB," + "Tensor!? scale, Tensor!? bias, ScalarType!? output_dtype) -> Tensor"); + ops.impl("cslt_mm_semi_structured2", torch::kCUDA, &cslt_mm_semi_structured2); ops.def("cslt_clear_cache() -> ()"); ops.impl("cslt_clear_cache", &cslt_clear_cache); diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index e107630979250..bc339d1ac9c32 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -3,14 +3,13 @@ from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import ( - compress_to_torch_sparse_semi_structured_mat, + clear_cache, compress_to_torch_sparse_semi_structured_mat, decompress_torch_sparse_semi_structured_mat, dense_matmul, generate_pruned_semi_structured_mat, get_random_mat, is_semi_structured_supported, semi_structured_dense_sparse_T_gemm, semi_structured_dense_sparse_T_gemm_scaled, semi_structured_sparse_dense_gemm, - semi_structured_sparse_dense_gemm_scaled, - clear_cache) + semi_structured_sparse_dense_gemm_scaled) DTYPES = [torch.float16, torch.bfloat16, torch.int8] SIZES = [(128, 128), (1024, 8192)] @@ -124,16 +123,20 @@ def test_torch_semi_structured_sparse_dense_T_matmul(mnk, dtype): def test_torch_semi_structured_sparse_dense_T_fp8_matmul(): M, N, K = (32, 64, 32) dtype = torch.float8_e4m3fn - A_pruned = generate_pruned_semi_structured_mat(M, N, dtype=dtype) + A_pruned = generate_pruned_semi_structured_mat(M, K, dtype=dtype) A = compress_to_torch_sparse_semi_structured_mat(A_pruned) - B = torch.full((K, N), .25, device='cuda', dtype=dtype).t() - - C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) - C_sparse = semi_structured_sparse_dense_gemm(A, B).to(torch.float32) + B = torch.full((N, K), .25, device='cuda', dtype=dtype).t() + bias = torch.ones(1, N, 
dtype=torch.float32, device='cuda') + + C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) + bias + C_sparse = semi_structured_sparse_dense_gemm(A, + B, + out_dtype=torch.float32, + bias=bias) torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) # Cached version - B = torch.full((K, N), .25, device='cuda', dtype=dtype).t() + B = torch.full((N, K), .25, device='cuda', dtype=dtype).t() C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) C_sparse = semi_structured_sparse_dense_gemm(A, B).to(torch.float32) torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) @@ -178,11 +181,15 @@ def test_torch_semi_structured_dense_sparse_T_fp8_matmul(): B_T = compress_to_torch_sparse_semi_structured_mat(B_T_pruned) A = torch.full((M, K), .25, device='cuda', dtype=dtype) - C_sparse = semi_structured_dense_sparse_T_gemm(A, B_T).to(torch.float32) + C_sparse = semi_structured_dense_sparse_T_gemm(A, + B_T, + out_dtype=torch.float32) C = dense_matmul(A, B_T_pruned.t(), dtype=dtype).to(torch.float32) torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) - C_sparse = semi_structured_dense_sparse_T_gemm(A, B_T).to(torch.float32) + C_sparse = semi_structured_dense_sparse_T_gemm(A, + B_T, + out_dtype=torch.float32) C = dense_matmul(A, B_T_pruned.t(), dtype=dtype).to(torch.float32) torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) clear_cache() @@ -216,26 +223,39 @@ def test_torch_semi_structured_sparse_dense_T_fp8_scaled_matmul(): # cached B = torch.rand((K, N), device='cuda').to(torch.float16).t() B_fp8, scale_B = to_float8(B) + scale_A_vec = scale_A.repeat(M) C = torch._scaled_mm(A_pruned_fp8, B_fp8, scale_a=scale_A, scale_b=scale_B, out_dtype=torch.float32) - C_sparse = semi_structured_sparse_dense_gemm_scaled(A_fp8_sparse, - B_fp8, - scale_a=scale_A, - scale_b=scale_B).to( - torch.float32) + # tensor-wise + C_sparse = semi_structured_sparse_dense_gemm_scaled( + A_fp8_sparse, + B_fp8, + scale_a=scale_A, + scale_b=scale_B, + out_dtype=torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=7e-2, atol=7e-2) + + # channel-wise + C_sparse = semi_structured_sparse_dense_gemm_scaled( + A_fp8_sparse, + B_fp8, + scale_a=scale_A_vec, + scale_b=scale_B, + out_dtype=torch.float32) torch.testing.assert_close(C, C_sparse, rtol=7e-2, atol=7e-2) # noncached - C_sparse = semi_structured_sparse_dense_gemm_scaled(A_fp8_sparse, - B_fp8, - scale_a=scale_A, - scale_b=scale_B, - cached=False).to( - torch.float32) + C_sparse = semi_structured_sparse_dense_gemm_scaled( + A_fp8_sparse, + B_fp8, + scale_a=scale_A, + scale_b=scale_B, + cached=False, + out_dtype=torch.float32) torch.testing.assert_close(C, C_sparse, rtol=7e-2, atol=7e-2) clear_cache() @@ -252,11 +272,12 @@ def test_torch_semi_structured_dense_sparse_T_fp8_scaled_matmul(): B_T_pruned_fp8, scale_b = to_float8(B_T_pruned) B_T_packed = compress_to_torch_sparse_semi_structured_mat(B_T_pruned_fp8) - C_sparse = semi_structured_dense_sparse_T_gemm_scaled(A_fp8, - B_T_packed, - scale_a=scale_a, - scale_b=scale_b).to( - torch.float32) + C_sparse = semi_structured_dense_sparse_T_gemm_scaled( + A_fp8, + B_T_packed, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float32) C = torch._scaled_mm(B_T_pruned_fp8, A_fp8.t(), scale_a=scale_b, @@ -278,6 +299,7 @@ def test_torch_semi_structured_sparse_dense_t_int8_scaled_matmul(): scale_a = torch.tensor(2.0, dtype=torch.float32, device='cuda') scale_b = torch.tensor(2.0, dtype=torch.float32, device='cuda') + scale_a_vec = scale_a.repeat(M) C = dense_matmul(A_pruned, B.t(), @@ 
-290,4 +312,11 @@ def test_torch_semi_structured_sparse_dense_t_int8_scaled_matmul(): scale_b=scale_b).to( torch.float32) torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) + + C_sparse = semi_structured_sparse_dense_gemm_scaled(A, + B.t(), + scale_a=scale_a_vec, + scale_b=scale_b).to( + torch.float32) + torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) clear_cache() diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index d5c04d94e6d50..8217251ab831f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -713,21 +713,25 @@ def semi_structured_fp8_compress(input: torch.Tensor) -> torch.Tensor: return torch.ops._C.cslt_compress_fp8_semi_structured(input) -def semi_structured_mm(A_compressed: torch.Tensor, - B_dense: torch.Tensor, - scale: Optional[float] = None, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: +def semi_structured_mm( + A_compressed: torch.Tensor, + B_dense: torch.Tensor, + scale: Optional[float] = None, + bias: Optional[torch.Tensor] = None, + out_dtype: Optional[torch.dtype] = None) -> torch.Tensor: return torch.ops._C.cslt_mm_semi_structured(A_compressed, B_dense, scale, - bias) + bias, out_dtype) -def semi_structured_fp8_mm2( +def semi_structured_mm2( A_compressed: torch.Tensor, B_dense: torch.Tensor, scale: Optional[float] = None, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return torch.ops._C.cslt_mm_fp8_semi_structured2(A_compressed, B_dense, - scale, bias) + bias: Optional[torch.Tensor] = None, + out_dtype: Optional[torch.dtype] = None) -> torch.Tensor: + return torch.ops._C.cslt_mm_semi_structured2(A_compressed, B_dense, scale, + bias, out_dtype) + def semi_structured_clear_cache() -> None: return torch.ops._C.cslt_clear_cache() diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index f98631ab04a49..38b1fb12e8835 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -4,12 +4,18 @@ SparseSemiStructuredTensorCUSPARSELT, to_sparse_semi_structured) -from vllm._custom_ops import (cutlass_scaled_mm, semi_structured_fp8_compress, - semi_structured_fp8_mm2, semi_structured_mm, semi_structured_clear_cache) +from vllm._custom_ops import (cutlass_scaled_mm, semi_structured_clear_cache, + semi_structured_fp8_compress, semi_structured_mm, + semi_structured_mm2) from vllm.platforms import current_platform SparseSemiStructuredTensor._FORCE_CUTLASS = False +# +# Allocating a dummy tensor to pass as input_scale +TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() \ + if current_platform.is_rocm() else None + def compress_to_torch_sparse_semi_structured_mat(pruned_tensor: torch.Tensor): ''' @@ -62,18 +68,48 @@ def decompress_torch_sparse_semi_structured_mat(packed_tensor: torch.Tensor): device=packed_tensor.device).t()) +def _pad_dense_input(dense_input: torch.Tensor) -> torch.Tensor: + """ + Calculates padding for dense tensor and pads tensor if necessary. + If padding is not required, this function returns the original tensor. 
+ """ + # only 2d matmul + assert dense_input.dim() == 2 + if torch.float8_e4m3fn not in \ + SparseSemiStructuredTensorCUSPARSELT._DTYPE_SHAPE_CONSTRAINTS: + SparseSemiStructuredTensorCUSPARSELT._DTYPE_SHAPE_CONSTRAINTS[ + torch.float8_e4m3fn] = \ + SparseSemiStructuredTensorCUSPARSELT._DTYPE_SHAPE_CONSTRAINTS[torch.int8] + # check shape + m, n = dense_input.shape + min_rows = SparseSemiStructuredTensorCUSPARSELT._DTYPE_SHAPE_CONSTRAINTS[ + dense_input.dtype].dense_min_rows + min_cols = SparseSemiStructuredTensorCUSPARSELT._DTYPE_SHAPE_CONSTRAINTS[ + dense_input.dtype].dense_min_cols + + # calculate padding + to_pad_m = -m % min_rows if m < min_rows or m % min_rows else 0 + to_pad_n = -n % min_cols if n < min_cols or n % min_rows else 0 + if to_pad_m or to_pad_n: + return torch.nn.functional.pad(dense_input, (0, to_pad_n, 0, to_pad_m)) + else: + return dense_input + + def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, b_dense: torch.Tensor, bias: torch.Tensor = None, + out_dtype: torch.dtype = None, cached: bool = True): ''' Performs matrix multiplication (A @ B) of semi-structured sparse (A) and dense (B) matrices. In case of int8 and fp8 types, dense matrix B has to be non-contiguous. Args: a_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat. - b_dense (torch.Tensor) - dense matrix tensor. + b_dense (torch.Tensor) - dense matrix tensor. For int8 and fp8 has to be non-contiguous. bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. - cached (bool) - whether to use cached (faster) version of cusparseLt wrapper. + out_dtype (torch.dtype) - Type of returned tensor for int8 or fp8 matmul. default: None, i.e. quantized output. + cached (bool) - use cached (faster) version of cusparseLt wrapper. Result: torch.Tensor - Result of matrix multiplication. @@ -86,30 +122,39 @@ def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, ]: raise ValueError("cuSparseLt does not support" "contiguous dense matrix for int8 and fp8 types") + if a_packed.dtype in [torch.float16, torch.bfloat16]: + assert out_dtype is None, \ + "out_dtype is a parameter for quantized inputs" + row, col = b_dense.shape + b_dense = _pad_dense_input(b_dense) if cached: - return semi_structured_mm(a_packed.packed, b_dense, bias=bias) + result = semi_structured_mm(a_packed.packed, + b_dense, + bias=bias, + out_dtype=out_dtype) else: - if a_packed.dtype == torch.float8_e4m3fn: - return semi_structured_fp8_mm2(a_packed.packed, b_dense, bias=bias) - else: - result = torch.mm(a_packed, b_dense) - if bias is not None: - result = torch.add(result, bias) - return result + result = semi_structured_mm2(a_packed.packed, + b_dense, + bias=bias, + out_dtype=out_dtype) + + return result[:, :col] def semi_structured_dense_sparse_T_gemm(a_dense: torch.Tensor, b_T_packed: torch.Tensor, bias: torch.Tensor = None, + out_dtype: torch.dtype = None, cached: bool = True): ''' Performs matrix multiplication (a @ b_T) of transposed semi-structured sparse and dense matrices Args: - a_dense (torch.Tensor) - dense matrix tensor. + a_dense (torch.Tensor) - dense matrix tensor. For int8 and fp8 has to be contiguous. b_T_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. - cached (bool) - whether to use cached (faster) version of cusparseLt wrapper. 
+ out_dtype (torch.dtype) - Type of returned tensor for int8 or fp8 matmul. default: None, i.e. quantized output. + cached (bool) - use cached (faster) version of cusparseLt wrapper. Returns: torch.Tensor - Result of matrix multiplication. @@ -117,6 +162,7 @@ def semi_structured_dense_sparse_T_gemm(a_dense: torch.Tensor, return (semi_structured_sparse_dense_gemm(b_T_packed, a_dense.t(), bias=bias, + out_dtype=out_dtype, cached=cached)).t() @@ -125,34 +171,68 @@ def semi_structured_sparse_dense_gemm_scaled(a_packed: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor = None, - cached: bool = False): + out_dtype: torch.dtype = None, + cached: bool = True): ''' Performs scaled matrix multiplication (a @ b) of transposed semi-structured sparse and dense fp8 matrices Args: a_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat. - b_dense (torch.Tensor) - dense matrix tensor. + b_dense (torch.Tensor) - dense matrix tensor. For int8 and fp8 has to be non-contiguous. scale_a (torch.Tensor) - scaling factor for sparse matrix, must be in float32. scale_b (torch.Tensor) - scaling factor for dense matrix, must be in float32. bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. - cached (bool) - whether to use cached (faster) version of cusparseLt wrapper. + out_dtype (torch.dtype) - Type of returned tensor for int8 or fp8 matmul. default: None, i.e. quantized output. + cached (bool) - use cached (faster) version of cusparseLt wrapper. Returns: torch.Tensor - Result of matrix multiplication. ''' # noqa: E501 + assert a_packed.dtype in [ + torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn + ], f"Semi structured sparse-dense matmul does not support {a_packed.dtype}" + + if b_dense.is_contiguous() and a_packed.dtype in [ + torch.int8, torch.float8_e4m3fn + ]: + raise ValueError("cuSparseLt does not support" + "contiguous dense matrix for int8 and fp8 types") + if a_packed.dtype in [torch.float16, torch.bfloat16]: + assert out_dtype is None, \ + "out_dtype is a parameter for quantized inputs" - # cusparseLt requires alpha to be float + # cusparseLt requires scale to be float assert scale_a.dtype == torch.float32 and scale_b.dtype == torch.float32 - scale = (scale_a * scale_b).item() - if cached: - return semi_structured_mm(a_packed.packed, - b_dense, - scale=scale, - bias=bias) + row, col = b_dense.shape + b_dense = _pad_dense_input(b_dense) + + per_tensor_weights = (scale_a.numel() == 1) + per_tensor_activations = (scale_b.numel() == 1) + + def matmul_(a, b, **kwargs): + if cached: + return semi_structured_mm(a, b, **kwargs) + else: + return semi_structured_mm2(a, b, **kwargs) + + if a_packed.dtype == torch.float8_e4m3fn: + scale = scale_a * scale_b + result = matmul_(a_packed.packed, b_dense, out_dtype=torch.float32) + + result = torch.narrow(result, 1, 0, col) + result = result * scale + result = result.to(out_dtype) + if bias is not None: + result = result + bias else: - return semi_structured_fp8_mm2(a_packed.packed, - b_dense, - bias=bias, - scale=scale) + scale = scale_a * scale_b + if per_tensor_weights and per_tensor_activations: + scale = scale.repeat(a_packed.shape[0]) + result = matmul_(a_packed.packed, + b_dense, + scale=scale, + bias=bias, + out_dtype=out_dtype) + return result def semi_structured_dense_sparse_T_gemm_scaled(a_dense: torch.Tensor, @@ -160,28 +240,35 @@ def semi_structured_dense_sparse_T_gemm_scaled(a_dense: torch.Tensor, scale_a: torch.Tensor = 
None, scale_b: torch.Tensor = None, bias: torch.Tensor = None, + out_dtype: torch.dtype = None, cached: bool = True): ''' Performs matrix multiplication (a @ b_T) of transposed semi-structured sparse and dense matrices Args: - a_dense (torch.Tensor) - dense matrix tensor. + a_dense (torch.Tensor) - dense matrix tensor. For int8 and fp8 has to be contiguous. b_T_packed (torch.Tensor) - torch wrapped cusparseLt-packed tensor. Result of compress_to_torch_sparse_semi_structured_mat + scale_a (torch.Tensor) - scaling factor for sparse matrix, must be in float32. + scale_b (torch.Tensor) - scaling factor for dense matrix, must be in float32. bias (torch.Tensor) - bias to fuse in matrix multiplication. default : None. + out_dtype (torch.dtype) - Type of returned tensor for int8 or fp8 matmul. default: None, i.e. quantized output. cached (bool) - whether to use cached(faster) version of cusparseLt wrapper. Returns: torch.Tensor - Result of matrix multiplication. ''' # noqa: E501 - return (semi_structured_sparse_dense_gemm_scaled(b_T_packed, - a_dense.t(), - scale_a=scale_b, - scale_b=scale_a, - bias=bias, - cached=cached)).t() + return (semi_structured_sparse_dense_gemm_scaled( + b_T_packed, + a_dense.t(), + scale_a=scale_b, + scale_b=scale_a, + bias=bias, + cached=cached)).t().contiguous() + def clear_cache(): semi_structured_clear_cache() + # test utils def dense_matmul(A, B, dtype, scale_a=None, scale_b=None): if dtype in [torch.int8, torch.float8_e4m3fn]: From 31cf482ef401618e83cd1b4949b1488767c28f86 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 13 Nov 2024 11:17:49 +0000 Subject: [PATCH 37/39] Fix and polish --- CMakeLists.txt | 13 +++-- .../cusparseLt_benchmarks/benchmark_24.py | 55 ++++++++++++------- .../fp8_semi_structured/cusparseLt.cpp | 4 -- .../sparsity/utils/cusparse_2_4_utils.py | 28 ++++++++-- 4 files changed, 65 insertions(+), 35 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 592fb6f4ea581..6106fad378bc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -402,13 +402,14 @@ define_gpu_extension_target( target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) # If cuSparseLt is not installed we skip 2:4 optimizations -CHECK_INCLUDE_FILE_CXX("cusparseLt.h" HAVE_CUSPARSELT) -message(STATUS "Result of include cusparseLt ${HAVE_CUSPARSELT}") -target_compile_definitions(_C PRIVATE VLLM_CUSPARSELT_ENABLED=1) +find_path(CUSPARSELT_INCLUDE_PATH cusparseLt.h + HINTS ${CUSPARSELT_INCLUDE_DIR} + PATH_SUFFIXES cuda/include cuda include) -# if(HAVE_CUSPARSELT) -# target_compile_definitions(_C PRIVATE VLLM_CUSPARSELT_ENABLED=1) -# endif() +if(CUSPARSELT_INCLUDE_PATH) + message(STATUS "CuSparseLt header file found ${CUSPARSELT_INCLUDE_PATH}") + target_compile_definitions(_C PRIVATE VLLM_CUSPARSELT_ENABLED=1) +endif() # # _moe_C extension # diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index 594db20cb5fb5..d5fcc369347bb 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -13,6 +13,8 @@ is_semi_structured_supported, semi_structured_sparse_dense_gemm, semi_structured_sparse_dense_gemm_scaled) from vllm.utils import FlexibleArgumentParser +import time +import pickle DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [32, 64, 128, 256, 512] @@ -105,16 +107,30 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, a_compressed = compress_to_torch_sparse_semi_structured_mat(a) # warmup scale = 
torch.tensor(1.0, dtype=torch.float32, device='cuda') - semi_structured_sparse_dense_gemm_scaled(a_compressed, - b, - scale_a=scale, - scale_b=scale) semi_structured_sparse_dense_gemm(a_compressed, b) timers.append( bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4", semi_structured_sparse_dense_gemm, a_compressed, b)) + + semi_structured_sparse_dense_gemm_scaled(a_compressed, + b, + scale_a=scale, + scale_b=scale) + timers.append( + bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4_scaled", + semi_structured_sparse_dense_gemm_scaled, a_compressed, b, scale, scale)) + + scale_vec = scale.repeat(a_compressed.shape[0]) + semi_structured_sparse_dense_gemm_scaled(a_compressed, + b, + scale_a=scale_vec, + scale_b=scale) + timers.append( + bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4_scaled_channel", + semi_structured_sparse_dense_gemm_scaled, a_compressed, b, scale_vec, scale)) + timers.append( bench_fn(label, sub_label, @@ -168,11 +184,6 @@ def print_timers(timers: Iterable[TMeasurement]): def run(MKNs: Iterable[Tuple[int, int, int]], use_fp8: bool) -> Iterable[TMeasurement]: results = [] - # MKNs = [(1024, 8192, 14336)] - # MKNs = [(2048, 8192, 14336)] - # MKNs = [(2048, 8192, 14336), (2048, 8192, 14336)] - # MKNs = [(32, 11008, 4096)] - # MKNs = [(2048, 11008, 14336)] for m, k, n in MKNs: timers = bench(m, k, n, "gemm", f"MKN=({m}x{k}x{n})", use_fp8) @@ -182,14 +193,6 @@ def run(MKNs: Iterable[Tuple[int, int, int]], return results -def make_output(data: Iterable[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], - base_description: str, - timestamp=None): - print(f"== All Results {base_description} ====") - print_timers(data) - - def run_model_bench(args): if not is_semi_structured_supported(): raise ValueError("Device does not support semi-structured sparsity") @@ -227,6 +230,15 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: print(f"== Results cuSparseLt {model}-TP{tp_size} ====") print_timers(data) + if args.save_results: + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{timestamp}.pkl", "wb") as f: + pickle.dump(all_data, f) if __name__ == '__main__': @@ -238,8 +250,8 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: python3 ./benchmarks/cusparseLt_benchmarks/benchmark_24.py --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 - Output: - - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cusparseLt implementations for the various GEMMs. + Output if --save-results: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch, cutlass and cusparseLt implementations for the various GEMMs. 
""", # noqa: E501 formatter_class=argparse.RawTextHelpFormatter) @@ -261,5 +273,10 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: action='store_true', help='Add benchmarking fp8 matmul (on supporting fp8 platforms)') + parser.add_argument( + '--save-results', + action='store_true', + help='Save results to a pickle file named model_bench_{timestamp}.pkl') + args = parser.parse_args() run_model_bench(args) diff --git a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp index d01b07d14d387..d274075b194e3 100644 --- a/csrc/quantization/fp8_semi_structured/cusparseLt.cpp +++ b/csrc/quantization/fp8_semi_structured/cusparseLt.cpp @@ -340,10 +340,6 @@ torch::Tensor cslt_mm_semi_structured( sizeof(dBias))); } - // float scale = scale_opt.has_value() ? static_cast(*scale_opt) : 1.0; - // float beta = 0.0; - // auto scale_ptr = &scale; - float scale = 1.0; auto scale_ptr = &scale; float beta = 0.0; diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 38b1fb12e8835..e312eca587132 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -117,7 +117,7 @@ def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, assert a_packed.dtype in [ torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn ], f"Semi structured sparse-dense matmul does not support {a_packed.dtype}" - if b_dense.is_contiguous() and a_packed.dtype in [ + if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 and b_dense.is_contiguous()) and a_packed.dtype in [ torch.int8, torch.float8_e4m3fn ]: raise ValueError("cuSparseLt does not support" @@ -128,6 +128,12 @@ def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, row, col = b_dense.shape b_dense = _pad_dense_input(b_dense) + if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 and b_dense.is_contiguous()) and a_packed.dtype in [ + torch.int8, torch.float8_e4m3fn + ]: + # We have to provide non-contiguous b_dense to cusparseLt for int8 and fp8 + b_dense = b_dense.t().contiguous().t() + if cached: result = semi_structured_mm(a_packed.packed, b_dense, @@ -191,10 +197,10 @@ def semi_structured_sparse_dense_gemm_scaled(a_packed: torch.Tensor, torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn ], f"Semi structured sparse-dense matmul does not support {a_packed.dtype}" - if b_dense.is_contiguous() and a_packed.dtype in [ + if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 and b_dense.is_contiguous()) and a_packed.dtype in [ torch.int8, torch.float8_e4m3fn ]: - raise ValueError("cuSparseLt does not support" + raise ValueError("cuSparseLt does not support " "contiguous dense matrix for int8 and fp8 types") if a_packed.dtype in [torch.float16, torch.bfloat16]: assert out_dtype is None, \ @@ -205,6 +211,12 @@ def semi_structured_sparse_dense_gemm_scaled(a_packed: torch.Tensor, row, col = b_dense.shape b_dense = _pad_dense_input(b_dense) + if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 and b_dense.is_contiguous()) and a_packed.dtype in [ + torch.int8, torch.float8_e4m3fn + ]: + # We have to provide non-contiguous b_dense to cusparseLt for int8 and fp8 + b_dense = b_dense.t().contiguous().t() + per_tensor_weights = (scale_a.numel() == 1) per_tensor_activations = (scale_b.numel() == 1) @@ -214,24 +226,27 @@ def matmul_(a, b, **kwargs): else: return semi_structured_mm2(a, b, **kwargs) + scale = scale_a 
* scale_b if a_packed.dtype == torch.float8_e4m3fn: - scale = scale_a * scale_b result = matmul_(a_packed.packed, b_dense, out_dtype=torch.float32) - result = torch.narrow(result, 1, 0, col) result = result * scale result = result.to(out_dtype) if bias is not None: result = result + bias else: - scale = scale_a * scale_b if per_tensor_weights and per_tensor_activations: + # cuSparseLt requires per-tensor scale to be on host + # and channel-wise scales on device. + # In order to make it work with CUDAGraphs + # we replicate per-tensor scale to channel-wise scales scale = scale.repeat(a_packed.shape[0]) result = matmul_(a_packed.packed, b_dense, scale=scale, bias=bias, out_dtype=out_dtype) + result = torch.narrow(result, 1, 0, col) return result @@ -262,6 +277,7 @@ def semi_structured_dense_sparse_T_gemm_scaled(a_dense: torch.Tensor, scale_a=scale_b, scale_b=scale_a, bias=bias, + out_dtype=out_dtype, cached=cached)).t().contiguous() From 68512d4ff0832016037376012f47a2b1b9dc49b8 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 13 Nov 2024 11:33:44 +0000 Subject: [PATCH 38/39] Formatting --- .../cusparseLt_benchmarks/benchmark_24.py | 109 +++++++++--------- .../sparsity/utils/cusparse_2_4_utils.py | 34 +++--- 2 files changed, 74 insertions(+), 69 deletions(-) diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index d5fcc369347bb..cb861c6634c88 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py +++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -1,6 +1,8 @@ import argparse import copy import itertools +import pickle +import time from typing import Callable, Iterable, List, Tuple import torch @@ -13,8 +15,6 @@ is_semi_structured_supported, semi_structured_sparse_dense_gemm, semi_structured_sparse_dense_gemm_scaled) from vllm.utils import FlexibleArgumentParser -import time -import pickle DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [32, 64, 128, 256, 512] @@ -54,21 +54,15 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, timers = [] # pytorch float16 - # timers.append( - # bench_fn(label, sub_label, "pytorch_fp16_fp16_matmul", torch.mm, - # a.to(dtype=torch.float16), b.to(dtype=torch.float16))) - - # # pytorch bf16 - # timers.append( - # bench_fn(label, sub_label, "pytorch_bf16_bf16_matmul", torch.mm, - # a.to(dtype=torch.bfloat16, device="cuda"), - # b.to(dtype=torch.bfloat16, device="cuda"))) + timers.append( + bench_fn(label, sub_label, "pytorch_fp16_fp16_matmul", torch.mm, + a.to(dtype=torch.float16), b.to(dtype=torch.float16))) - # # cusparseLt fp16 - # timers.append( - # bench_fn(label, sub_label, "cusparseLt_fp16_fp16_2_4", - # semi_structured_sparse_dense_gemm, - # compress_to_torch_sparse_semi_structured_mat(a), b)) + # cusparseLt fp16 + timers.append( + bench_fn(label, sub_label, "cusparseLt_fp16_fp16_2_4", + semi_structured_sparse_dense_gemm, + compress_to_torch_sparse_semi_structured_mat(a), b)) # timers.append( # bench_fn(label, @@ -79,14 +73,17 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, # b, # cached=False)) - # # cusparseLt bf16 - # a, b = make_rand_tensors(torch.bfloat16, m, n, k) - # a_compressed = compress_to_torch_sparse_semi_structured_mat( - # a.to(dtype=torch.bfloat16)) + # pytorch bf16 + a, b = make_rand_tensors(torch.bfloat16, m, n, k) + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_matmul", torch.mm, a, b)) - # timers.append( - # bench_fn(label, sub_label, "cusparseLt_bf16_bf16_2_4", - # 
semi_structured_sparse_dense_gemm, a_compressed, b)) + # cusparseLt bf16 + a_compressed = compress_to_torch_sparse_semi_structured_mat(a) + + timers.append( + bench_fn(label, sub_label, "cusparseLt_bf16_bf16_2_4", + semi_structured_sparse_dense_gemm, a_compressed, b)) # timers.append( # bench_fn(label, @@ -99,9 +96,9 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, a, b = make_rand_tensors(torch.int8, m, n, k) # # cutlass i8 - # timers.append( - # bench_fn(label, sub_label, "cutlass_i8_i8_matmul", dense_matmul, a, b, - # torch.int8)) + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_matmul_scaled", dense_matmul, + a, b, torch.int8)) # cusparseLt i8 a_compressed = compress_to_torch_sparse_semi_structured_mat(a) @@ -113,38 +110,39 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4", semi_structured_sparse_dense_gemm, a_compressed, b)) - semi_structured_sparse_dense_gemm_scaled(a_compressed, b, scale_a=scale, scale_b=scale) timers.append( bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4_scaled", - semi_structured_sparse_dense_gemm_scaled, a_compressed, b, scale, scale)) - - scale_vec = scale.repeat(a_compressed.shape[0]) - semi_structured_sparse_dense_gemm_scaled(a_compressed, - b, - scale_a=scale_vec, - scale_b=scale) - timers.append( - bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4_scaled_channel", - semi_structured_sparse_dense_gemm_scaled, a_compressed, b, scale_vec, scale)) + semi_structured_sparse_dense_gemm_scaled, a_compressed, b, + scale, scale)) + + # scale_vec = scale.repeat(a_compressed.shape[0]) + # semi_structured_sparse_dense_gemm_scaled(a_compressed, + # b, + # scale_a=scale_vec, + # scale_b=scale) + # timers.append( + # bench_fn(label, sub_label, "cusparseLt_i8_i8_2_4_scaled_channel", + # semi_structured_sparse_dense_gemm_scaled, a_compressed, b, + # scale_vec, scale)) - timers.append( - bench_fn(label, - sub_label, - "cusparseLt_i8_i8_2_4_noncached", - semi_structured_sparse_dense_gemm, - a_compressed, - b, - cached=False)) + # timers.append( + # bench_fn(label, + # sub_label, + # "cusparseLt_i8_i8_2_4_noncached", + # semi_structured_sparse_dense_gemm, + # a_compressed, + # b, + # cached=False)) if use_fp8: a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) # cutlass fp8 timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul-w-scales", + bench_fn(label, sub_label, "cutlass_fp8_fp8_matmul_scaled", dense_matmul, a, b, torch.float8_e4m3fn)) # cusparseLt fp8 @@ -159,18 +157,18 @@ def bench(m: int, k: int, n: int, label: str, sub_label: str, semi_structured_sparse_dense_gemm_scaled(a_compressed, b, scale, scale) timers.append( - bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_scale", + bench_fn(label, sub_label, "cusparseLt_fp8_fp8_2_4_scaled", semi_structured_sparse_dense_gemm_scaled, a_compressed, b, scale, scale)) - timers.append( - bench_fn(label, - sub_label, - "cusparseLt_fp8_fp8_2_4_noncached", - semi_structured_sparse_dense_gemm, - a_compressed, - b, - cached=False)) + # timers.append( + # bench_fn(label, + # sub_label, + # "cusparseLt_fp8_fp8_2_4_noncached", + # semi_structured_sparse_dense_gemm, + # a_compressed, + # b, + # cached=False)) return timers @@ -240,6 +238,7 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: with open(f"model_bench-{timestamp}.pkl", "wb") as f: pickle.dump(all_data, f) + if __name__ == '__main__': parser = FlexibleArgumentParser( diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py 
b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index e312eca587132..5a42dd03f9170 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -117,9 +117,10 @@ def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, assert a_packed.dtype in [ torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn ], f"Semi structured sparse-dense matmul does not support {a_packed.dtype}" - if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 and b_dense.is_contiguous()) and a_packed.dtype in [ - torch.int8, torch.float8_e4m3fn - ]: + if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 + and b_dense.is_contiguous()) and a_packed.dtype in [ + torch.int8, torch.float8_e4m3fn + ]: raise ValueError("cuSparseLt does not support" "contiguous dense matrix for int8 and fp8 types") if a_packed.dtype in [torch.float16, torch.bfloat16]: @@ -128,10 +129,12 @@ def semi_structured_sparse_dense_gemm(a_packed: torch.Tensor, row, col = b_dense.shape b_dense = _pad_dense_input(b_dense) - if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 and b_dense.is_contiguous()) and a_packed.dtype in [ - torch.int8, torch.float8_e4m3fn - ]: - # We have to provide non-contiguous b_dense to cusparseLt for int8 and fp8 + if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 + and b_dense.is_contiguous()) and a_packed.dtype in [ + torch.int8, torch.float8_e4m3fn + ]: + # We have to provide non-contiguous b_dense + # to cusparseLt for int8 and fp8 b_dense = b_dense.t().contiguous().t() if cached: @@ -197,9 +200,10 @@ def semi_structured_sparse_dense_gemm_scaled(a_packed: torch.Tensor, torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn ], f"Semi structured sparse-dense matmul does not support {a_packed.dtype}" - if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 and b_dense.is_contiguous()) and a_packed.dtype in [ - torch.int8, torch.float8_e4m3fn - ]: + if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 + and b_dense.is_contiguous()) and a_packed.dtype in [ + torch.int8, torch.float8_e4m3fn + ]: raise ValueError("cuSparseLt does not support " "contiguous dense matrix for int8 and fp8 types") if a_packed.dtype in [torch.float16, torch.bfloat16]: @@ -211,10 +215,12 @@ def semi_structured_sparse_dense_gemm_scaled(a_packed: torch.Tensor, row, col = b_dense.shape b_dense = _pad_dense_input(b_dense) - if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 and b_dense.is_contiguous()) and a_packed.dtype in [ - torch.int8, torch.float8_e4m3fn - ]: - # We have to provide non-contiguous b_dense to cusparseLt for int8 and fp8 + if (b_dense.shape[0] > 1 and b_dense.shape[1] > 1 + and b_dense.is_contiguous()) and a_packed.dtype in [ + torch.int8, torch.float8_e4m3fn + ]: + # We have to provide non-contiguous b_dense + # to cusparseLt for int8 and fp8 b_dense = b_dense.t().contiguous().t() per_tensor_weights = (scale_a.numel() == 1) From 72d6cd3ba4c0066ad922efd8e0207eae791ba083 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Fri, 15 Nov 2024 16:53:10 +0000 Subject: [PATCH 39/39] Minor test and benchmarks updates --- benchmarks/cusparseLt_benchmarks/benchmark_24.py | 6 +++--- tests/kernels/test_semi_structured.py | 15 ++++++++++----- .../layers/sparsity/utils/cusparse_2_4_utils.py | 2 ++ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py index cb861c6634c88..15381de006d12 100644 --- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py 
+++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py @@ -24,8 +24,8 @@ # helpers def make_rand_tensors(dtype: torch.dtype, m: int, n: int, k: int) -> Tuple[torch.Tensor, torch.Tensor]: - a = get_random_mat(m, k, dtype) - b = get_random_mat(n, k, dtype).t() + a = get_random_mat(n, k, dtype) + b = get_random_mat(m, k, dtype).t() return a, b @@ -213,7 +213,7 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: KNs = model_shapes(model, tp_size) MKNs = [] for m in Ms: - assert m % 32 == 0, "Batch size has to be a multiple of 32" + assert m % 16 == 0, "Batch size has to be a multiple of 16" for k, n in KNs: if k % 32 or n % 32: continue diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py index bc339d1ac9c32..2aba096d30a73 100644 --- a/tests/kernels/test_semi_structured.py +++ b/tests/kernels/test_semi_structured.py @@ -138,7 +138,10 @@ def test_torch_semi_structured_sparse_dense_T_fp8_matmul(): # Cached version B = torch.full((N, K), .25, device='cuda', dtype=dtype).t() C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32) - C_sparse = semi_structured_sparse_dense_gemm(A, B).to(torch.float32) + C_sparse = semi_structured_sparse_dense_gemm(A, + B, + out_dtype=torch.bfloat16).to( + torch.float32) torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1) # Noncached version @@ -174,8 +177,9 @@ def test_torch_semi_structured_dense_sparse_T_matmul(mnk, dtype): not is_semi_structured_supported() or not is_quant_method_supported("modelopt"), reason="Semi structured fp8 matmul is not supported on this GPU type.") -def test_torch_semi_structured_dense_sparse_T_fp8_matmul(): - M, N, K = (32, 64, 32) +@pytest.mark.parametrize("mnk", MNK) +def test_torch_semi_structured_dense_sparse_T_fp8_matmul(mnk): + M, N, K = mnk dtype = torch.float8_e4m3fn B_T_pruned = generate_pruned_semi_structured_mat(N, K, dtype=dtype) B_T = compress_to_torch_sparse_semi_structured_mat(B_T_pruned) @@ -290,9 +294,10 @@ def test_torch_semi_structured_dense_sparse_T_fp8_scaled_matmul(): @pytest.mark.skipif( not is_semi_structured_supported(), reason="Semi structured matmul is not supported on this GPU type.") -def test_torch_semi_structured_sparse_dense_t_int8_scaled_matmul(): +@pytest.mark.parametrize("mnk", MNK) +def test_torch_semi_structured_sparse_dense_t_int8_scaled_matmul(mnk): dtype = torch.int8 - M, N, K = (32, 64, 32) + M, N, K = mnk A_pruned = generate_pruned_semi_structured_mat(M, K, dtype) A = compress_to_torch_sparse_semi_structured_mat(A_pruned) B = get_random_mat(N, K, dtype) diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py index 5a42dd03f9170..7dceadb5d2686 100644 --- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py +++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py @@ -234,6 +234,8 @@ def matmul_(a, b, **kwargs): scale = scale_a * scale_b if a_packed.dtype == torch.float8_e4m3fn: + if not (per_tensor_activations and per_tensor_weights): + scale = scale[:, None] result = matmul_(a_packed.packed, b_dense, out_dtype=torch.float32) result = torch.narrow(result, 1, 0, col) result = result * scale
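
For reference, below is a minimal end-to-end sketch of the int8 scaled 2:4 sparse GEMM path introduced in this series. It is not part of the patch: it assumes a GPU with cuSparseLt support and the helper signatures from vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py as they stand after the final commit, and the shapes, scale values, and tolerances simply mirror the unit test above, so they are illustrative rather than normative.

    # Sketch only: exercises the scaled int8 sparse x dense path added in this
    # series. Assumes cuSparseLt support and the helpers defined in the patch.
    import torch

    from vllm.model_executor.layers.sparsity.utils.cusparse_2_4_utils import (
        clear_cache, compress_to_torch_sparse_semi_structured_mat,
        dense_matmul, generate_pruned_semi_structured_mat, get_random_mat,
        is_semi_structured_supported, semi_structured_sparse_dense_gemm_scaled)

    assert is_semi_structured_supported(), \
        "requires a GPU/driver combination with cuSparseLt 2:4 support"

    M, N, K = 32, 64, 32
    dtype = torch.int8

    # 2:4-pruned sparse operand A (M x K), packed into cuSparseLt format.
    A_pruned = generate_pruned_semi_structured_mat(M, K, dtype)
    A_packed = compress_to_torch_sparse_semi_structured_mat(A_pruned)

    # Dense operand B: for int8/fp8 the dense matrix must be non-contiguous,
    # so build it as (N, K) and pass the transposed (K, N) view.
    B = get_random_mat(N, K, dtype)

    # cuSparseLt requires float32 scales; per-tensor scales are replicated
    # to channel-wise internally for the int8 path.
    scale_a = torch.tensor(2.0, dtype=torch.float32, device="cuda")
    scale_b = torch.tensor(2.0, dtype=torch.float32, device="cuda")

    # Scaled sparse x dense matmul. The first call for a new problem shape
    # runs the cusparseLt search and populates the plan cache (cached=True
    # is the default); subsequent calls reuse the cached plan.
    C_sparse = semi_structured_sparse_dense_gemm_scaled(
        A_packed, B.t(), scale_a=scale_a,
        scale_b=scale_b).to(torch.float32)

    # Dense CUTLASS reference with the same scales, for a rough check.
    C_ref = dense_matmul(A_pruned, B.t(), dtype=dtype,
                         scale_a=scale_a,
                         scale_b=scale_b).to(torch.float32)
    torch.testing.assert_close(C_ref, C_sparse, rtol=1e-1, atol=1e-1)

    clear_cache()  # drop cached cusparseLt plans and descriptors

The fp8 path follows the same pattern, additionally accepting out_dtype (e.g. torch.float32 or torch.bfloat16) to dequantize the output, and channel-wise scales can be passed by replacing either scale tensor with a float32 vector of per-row factors, as exercised in the tests above.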