From 5a2ab25cfa960d9ce74774e73308aafb0e854692 Mon Sep 17 00:00:00 2001
From: Eliza Wszola
Date: Fri, 2 Aug 2024 01:42:01 +0000
Subject: [PATCH 001/106] Moving branch to a different repo

---
 CMakeLists.txt                                |    3 +-
 csrc/moe/marlin_moe_ops.cu                    | 2915 +++++++++++++++++
 csrc/moe/marlin_moe_ops.h                     |   14 +
 csrc/moe/torch_bindings.cpp                   |   19 +-
 tests/kernels/test_moe.py                     |  212 +-
 vllm/_custom_ops.py                           |   13 +
 .../layers/fused_moe/__init__.py              |    4 +
 .../layers/fused_moe/fused_moe.py             |  211 +-
 vllm/model_executor/layers/fused_moe/layer.py |  381 ++-
 .../quantization/utils/marlin_utils_test.py   |   13 +-
 .../layers/quantization/utils/quant_utils.py  |   19 +-
 vllm/model_executor/models/mixtral_quant.py   |  157 +-
 12 files changed, 3860 insertions(+), 101 deletions(-)
 create mode 100644 csrc/moe/marlin_moe_ops.cu
 create mode 100644 csrc/moe/marlin_moe_ops.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d599c5470704..e6c38839c839c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,7 +211,8 @@ define_gpu_extension_target(
 
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
-  "csrc/moe/topk_softmax_kernels.cu")
+  "csrc/moe/topk_softmax_kernels.cu"
+  "csrc/moe/marlin_moe_ops.cu")
 
 define_gpu_extension_target(
   _moe_C

diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
new file mode 100644
index 0000000000000..ebc1693b2ba50
--- /dev/null
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -0,0 +1,2915 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+template <typename T>
+inline std::string str(T x) {
+  return std::to_string(x);
+}
+
+#define CPU_OFFSETS true
+
+namespace marlin_moe {
+
+constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+
+// Instances of `Vec` are used to organize groups of >>registers<<, as needed
+// for instance as inputs to tensor core operations. Consequently, all
+// corresponding index accesses must be compile-time constants, which is why we
+// extensively use `#pragma unroll` throughout the kernel code to guarantee
+// this.
+template <typename T, int n>
+struct Vec {
+  T elems[n];
+  __device__ T& operator[](int i) { return elems[i]; }
+};
+
+using I4 = Vec<int, 4>;
+
+// Matrix fragments for tensor core instructions; their precise layout is
+// documented here:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
+using FragA = Vec<half2, 4>;
+using FragB = Vec<half2, 2>;
+using FragC = Vec<float, 4>;
+using FragS = Vec<half2, 1>;  // quantization scales
+
+// Predicated asynchronous global->shared copy; used for inputs A where we
+// apply predication to handle batchsizes that are not multiples of 16.
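+// Note: the cp.async instructions used by the helpers below require sm_80
+// (Ampere) or newer; everything in this block is compiled only under the
+// __CUDA_ARCH__ >= 800 guard above.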
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
+                                      bool pred = true) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+// Asynchronous global->shared copy
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));
+}
+
+// Async copy fence.
+__device__ inline void cp_async_fence() {
+  asm volatile("cp.async.commit_group;\n" ::);
+}
+
+// Wait until at most `n` async copy stages are still pending.
+template <int n>
+__device__ inline void cp_async_wait() {
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
+}
+
+// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
+// output/accumulation.
+__device__ inline void mma(const FragA& a_frag, const FragB& frag_b,
+                           FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  float* c = reinterpret_cast<float*>(&frag_c);
+  asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+      : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+      : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+        "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from
+// shared memory, directly in tensor core layout.
+__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+               : "r"(smem));
+}
+
+// Lookup-table based 3-input logical operation; explicitly used for
+// dequantization as the compiler does not seem to automatically recognize it
+// in all cases.
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(res)
+               : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return res;
+}
+
+// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
+// values. We mostly follow the strategy in the link below, with some small
+// changes:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__device__ inline FragB dequant(int q) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
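+  // Magic constants for the fp16 trick above: 0x6400 is fp16(1024), so `lo`
+  // currently holds fp16(1024 + q) and `hi` holds fp16(1024 + 16 * q).
+  // SUB = fp16(1032) removes the 1024 bias and the +8 zero point in one
+  // subtraction, while MUL = fp16(1/16) and ADD = fp16(-72) do the same for
+  // the upper nibble with a single fma, since 1024 / 16 + 8 = 72.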
+ const int SUB = 0x64086408; + const int MUL = 0x2c002c00; + const int ADD = 0xd480d480; + FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. +__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { + half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +// Given 2 floats multiply by 2 scales (halves) +__device__ inline void scale_float(float* c, FragS& s) { + __half* s_ptr = reinterpret_cast<__half*>(&s); + c[0] = __fmul_rn(c[0], __half2float(s_ptr[0])); + c[1] = __fmul_rn(c[1], __half2float(s_ptr[1])); +} + +// Same as above, but for act_order (each K is multiplied individually) +__device__ inline void scale4(FragB& frag_b, FragS& frag_s_1, FragS& frag_s_2, + FragS& frag_s_3, FragS& frag_s_4, int i) { + __half2 s_val_1_2; + s_val_1_2.x = reinterpret_cast<__half*>(&frag_s_1)[i]; + s_val_1_2.y = reinterpret_cast<__half*>(&frag_s_2)[i]; + + __half2 s_val_3_4; + s_val_3_4.x = reinterpret_cast<__half*>(&frag_s_3)[i]; + s_val_3_4.y = reinterpret_cast<__half*>(&frag_s_4)[i]; + + frag_b[0] = __hmul2(frag_b[0], s_val_1_2); + frag_b[1] = __hmul2(frag_b[1], s_val_3_4); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int* lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. +__device__ inline void barrier_release(int* lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +// For a given "a" of size [M,K] performs a permutation of the K columns based +// on the given "perm" indices. 
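+// Each thread block handles `block_rows` consecutive rows; for every row, the
+// threads of the block cooperatively copy its size_k half-precision elements
+// from their permuted source columns (perm[k]) into the corresponding output
+// row.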
+__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, + int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, int size_m, + int size_k, int block_rows) { + int start_row = block_rows * blockIdx.x; + int finish_row = start_row + block_rows; + if (finish_row > size_m) { + finish_row = size_m; + } + int cur_block_rows = finish_row - start_row; + + int row_stride = size_k * sizeof(half) / 16; + + auto permute_row = [&](int row) { + int iters = size_k / blockDim.x; + int rest = size_k % blockDim.x; + + int offset = row * row_stride; + + half const* a_row_half = reinterpret_cast(a_int4_ptr + offset); + half* out_half = reinterpret_cast(out_int4_ptr + offset); + + int base_k = 0; + + for (int i = 0; i < iters; i++) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + + base_k += blockDim.x; + } + + if (rest) { + if (threadIdx.x < rest) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + } + } + }; + + for (int i = 0; i < cur_block_rows; i++) { + int cur_row = start_row + i; + if (cur_row < size_m) { + permute_row(cur_row); + } + } +} + +__global__ void compute_expert_offsets(int const* __restrict__ topk_ids, + int* __restrict__ expert_offsets, + int topk_length, + int block_size) { + int expert_id = threadIdx.x; + int num_experts = blockDim.x; + + int occurrences = 0; + for (int i = 0; i < topk_length; ++i) { + occurrences += (topk_ids[i] == expert_id); + } + expert_offsets[expert_id + 1] = occurrences; + __syncthreads(); + + if (threadIdx.x == 0) { + int tot_offset = 0; + expert_offsets[0] = 0; + for (int i = 0; i < num_experts; ++i) { + tot_offset += ceildiv(expert_offsets[i + 1], block_size) * block_size; + expert_offsets[i + 1] = tot_offset; + } + // for (int i = 0; i < num_experts + 1; ++i) { + // printf("expert offset: %d -> %d (%d %d)\n", + // i, expert_offsets[i], topk_length, block_size); + // } + } + __syncthreads(); + +} + +#if CPU_OFFSETS + +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale + > +__global__ void MarlinMoE( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int* __restrict__ sorted_ids, // int32 sorted ids of experts + const float* __restrict__ topk_weights, // float topk weights + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int* __restrict__ g_idx, // int32 group indices of shape k + const int* __restrict__ expert_offsets, + int num_groups, // number of scale groups per output channel + int expert_idx, // idx of current expert // TODO must decide based on offsets + int num_experts, // number of experts + int topk, // topk parameter of moe + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int tot_m, // total number of rows in A and C + int* locks, // extra global storage for barrier synchronization + bool replicate_input, // do we use the same input for each expert? 
+ bool apply_weights, // apply weights to output + int try_m_block_ctr, // experiment + int* barrier_ctrs +) { + + // int tot_m_blocks = ceildiv(tot_m, 16); + // if (try_m_block_ctr >= tot_m_blocks) { + // return; + // } + + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. + + // For larger GEMMs we run multiple batchsize 64 versions in parallel for a + // better partitioning with less reductions + int parallel = 1; + if (prob_m > 16 * thread_m_blocks) { + parallel = prob_m / (16 * thread_m_blocks); + prob_m = 16 * thread_m_blocks; + } + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); + + if constexpr (!has_act_order && group_blocks != -1) { + if (group_blocks >= thread_k_blocks) { + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts + // in the middle of group. + iters = (group_blocks / thread_k_blocks) * + ceildiv(iters, (group_blocks / thread_k_blocks)); + } + } + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + locks += (slice_col_par / n_tiles) * n_tiles; + slice_col = slice_col_par % n_tiles; + sorted_ids += (slice_col_par / n_tiles) * 16 * thread_m_blocks; + } + + // Compute all information about the current slice which is required for + // synchronization. 
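+  // init_slice recomputes slice_iters / slice_count / slice_idx each time this
+  // thread block advances to its next column slice, and moves the sorted_ids
+  // and locks pointers forward when wrapping around to a new batch of rows.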
+ auto init_slice = [&]() { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = ceildiv(k_tiles - col_off, iters); + if (col_off > 0) slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) slice_idx--; + } + } + if (slice_col == n_tiles) { + sorted_ids += 16 * thread_m_blocks; + locks += n_tiles; + slice_col = 0; + } + }; + init_slice(); + + // A sizes/strides + + // stride of the A matrix in global memory + int a_gl_stride = prob_k / 8; + // stride of an A matrix tile in shared memory + constexpr int a_sh_stride = 16 * thread_k_blocks / 8; + // delta between subsequent A tiles in global memory + constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; + // between subsequent accesses within a tile + int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); + // between shared memory writes + constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); + // between shared memory tile reads + constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); + // within a shared memory tile + constexpr int a_sh_rd_delta_i = a_sh_stride * 16; + // overall size of a tile + constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks); + // number of shared write iterations for a tile + constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta); + + // B sizes/strides + int b_gl_stride = 16 * prob_n / 32; + constexpr int b_sh_stride = 32 * thread_n_blocks / 4; + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); + constexpr int b_sh_wr_delta = threads; + constexpr int b_sh_rd_delta = threads; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + // Scale sizes/strides without act_order + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + constexpr int s_tb_groups = !has_act_order && group_blocks < thread_k_blocks + ? thread_k_blocks / group_blocks + : 1; + constexpr int s_sh_stage = s_tb_groups * s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + // Scale size/strides with act_order + constexpr int tb_k = 16 * thread_k_blocks; + constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0; + // constexpr int act_s_row_stride = 1; + // int act_s_col_stride = act_s_row_stride * num_groups; + int act_s_col_stride = 1; + int act_s_col_warp_stride = act_s_col_stride * 8; + int tb_n_warps = thread_n_blocks / 4; + int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; + + constexpr int sorted_sh_stride = threads; + constexpr int sorted_gl_stride = threads; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. 
+ int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. + int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = + b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x; + int b_sh_rd = threadIdx.x; + + // For act_order + constexpr int k_iter_size = tb_k / b_sh_wr_iters; + int slice_k_start = tb_k * slice_row; + int slice_k_finish = slice_k_start + tb_k * slice_iters; + int slice_k_start_shared_fetch = slice_k_start; + int slice_n_offset = act_s_col_tb_stride * slice_col; + + // No act_order + int s_gl_rd; + if constexpr (group_blocks == -1 || group_blocks == 0) { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } else { + s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + s_sh_stride * slice_col + threadIdx.x; + } + int s_sh_wr = threadIdx.x; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. + int s_sh_rd; + if constexpr (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + int sh_first_group_id = -1; + int sh_num_groups = -1; + constexpr int sh_max_num_groups = 32; + + int shs_size; + if constexpr (has_act_order) + shs_size = sh_max_num_groups * s_sh_stride + threads; + else + shs_size = group_blocks > 0 ? stages * s_sh_stage : threads; + + extern __shared__ int4 sh[]; + // Shared memory storage for global fetch pipelines. + int4* sh_a = sh; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_g_idx = sh_b + (stages * b_sh_stage); + int4* sh_s = sh_g_idx + (stages * g_idx_stage); + int* sh_sorted = (int*)(sh_s + shs_size); + + // Precompute which thread should not read memory in which iterations; this is + // needed if there are more threads than required for a certain tilesize or + // when the batchsize is not a multiple of 16. + bool a_sh_wr_pred[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + int a_idx = a_sh_wr_delta * i + a_sh_wr; + int row = a_idx / a_gl_rd_delta_o; + if (row >= prob_m) { + a_sh_wr_pred[i] = false; + } else { + a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; + } + } + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. 
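+  // (Due to operator precedence the XOR above is applied to the full linear
+  // index, i.e. (a_gl_rd_delta_o * row + col) ^ row, which is what spreads the
+  // rows of a tile across different shared memory banks.)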
+ int a_sh_wr_trans[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; // No act-order + FragS act_frag_s[2][4][4]; // For act-order + + // Zero accumulators. + auto zero_accums = [&]() { + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + auto fetch_scales_to_shared = [&](bool is_async, int first_group_id, + int last_group_id) { + sh_first_group_id = first_group_id; + sh_num_groups = last_group_id - first_group_id + 1; + + if (sh_num_groups < sh_max_num_groups) { + sh_num_groups = sh_max_num_groups; + } + + if (sh_first_group_id + sh_num_groups > num_groups) { + sh_num_groups = num_groups - sh_first_group_id; + } + + int row_offset = first_group_id * s_gl_stride; + + if (is_async) { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], + &scales_ptr[row_offset + (i * s_gl_stride) + + slice_n_offset + threadIdx.x]); + } + } + } else { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + sh_s[(i * s_sh_stride) + threadIdx.x] = + scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + + threadIdx.x]; + } + } + } + }; + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + int a_idx = a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off; + int row = a_idx / a_gl_stride; + int sorted_row = + replicate_input ? sorted_ids[row] / topk : sorted_ids[row]; + int new_idx = sorted_row * a_gl_stride + a_idx % a_gl_stride; + if (sorted_row < tot_m * (replicate_input ? 1 : topk) && + new_idx < a_gl_stride * tot_m * (replicate_input ? 
1 : topk)) { + cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[new_idx], + a_sh_wr_pred[i]); + } + } + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); + B_ptr[i] += b_gl_rd_delta_o; + } + + if constexpr (has_act_order) { + // Fetch g_idx thread-block portion + int full_pipe = a_off; + int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; + if (cur_k < prob_k && cur_k < slice_k_finish) { + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + + int4 const* cur_g_idx_stage_ptr = + reinterpret_cast(&g_idx[cur_k]); + + if (threadIdx.x < g_idx_stage) { + cp_async4_pred(&sh_g_idx_stage[threadIdx.x], + &cur_g_idx_stage_ptr[threadIdx.x]); + } + } + } else { + if constexpr (group_blocks != -1) { + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch scales if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } else { + for (int i = 0; i < s_tb_groups; i++) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], + &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } + } + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + // TODO fix + auto fetch_sorted_ids_to_shared = [&]() { + const int mpt = ceildiv(prob_m, threads); + for (int i = 0; i < mpt; i++) { + if ((i * sorted_gl_stride) + threadIdx.x < prob_m) { + sh_sorted[(i * sorted_sh_stride) + threadIdx.x] = + sorted_ids[(i * sorted_gl_stride) + threadIdx.x]; + } + } + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. 
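+  // ldsm4 pulls a full 16x16 A sub-tile per m-block straight into tensor core
+  // fragment layout, while the packed 4-bit B values are read as a plain I4
+  // and only dequantized later, inside matmul().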
+ auto fetch_to_registers = [&](int k, int pipe) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + frag_b_quant[k % 2] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); + }; + + bool is_same_group[stages]; + int same_group_id[stages]; + + auto init_same_group = [&](int pipe) { + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + int group_id_1 = sh_g_idx_int_ptr[0]; + int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; + + is_same_group[pipe] = group_id_1 == group_id_2; + same_group_id[pipe] = group_id_1; + }; + + auto fetch_scales_to_registers = [&](int k, int full_pipe) { + int pipe = full_pipe % stages; + + if constexpr (!has_act_order) { + // No act-order case + if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = k_blocks / group_blocks; + + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + reinterpret_cast(&frag_s[k % 2])[0] = + sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; + } + } + + return; + } + + // Act-order case + + // Determine K of the "current" thread-block + int cur_k = slice_k_start + tb_k * full_pipe; + if (cur_k >= prob_k || cur_k >= slice_k_finish) { + return; + } + + // Reset (to current thread-block) since we read g_idx portion from the + // shared memory + cur_k = 0; + + // Progress to current iteration + cur_k += k_iter_size * (k % b_sh_wr_iters); + + // Determine "position" inside the thread-block (based on warp and + // thread-id) + int warp_id = threadIdx.x / 32; + int n_warps = + thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N + + int warp_row = warp_id / n_warps; + int warp_col = warp_id % n_warps; + + cur_k += warp_row * 16; + + int th_id = threadIdx.x % 32; + cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix + + int s_col_shift = + /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + + (th_id / 4) * act_s_col_stride; + + if (is_same_group[pipe]) { + if (k % 2 == 0) { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + + s_col_shift]; + } else { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); + } + + for (int i = 1; i < 4; i++) { + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); + } + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + constexpr int k_frag_offsets[4] = {0, 1, 8, + 9}; // Tensor core offsets per thread + + #pragma unroll + for (int i = 0; i < 4; i++) { + int actual_k = cur_k + k_frag_offsets[i]; + + int group_id = sh_g_idx_int_ptr[actual_k]; + int rel_group_id = group_id - sh_first_group_id; + + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + sh_s[rel_group_id * s_sh_stride + s_col_shift]; + } + }; + 
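+  // The frag_a / frag_b_quant / frag_s buffers above are double buffered
+  // (indexed with k % 2) so the shared memory loads for sub-tile k + 1 can be
+  // issued while the tensor core math for sub-tile k is still in flight.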
+ // Execute the actual tensor core matmul of a sub-tile. + auto matmul = [&](int k) { + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. + #pragma unroll + for (int j = 0; j < 4; j++) { + int b_quant = frag_b_quant[k % 2][j]; + int b_quant_shift = b_quant >> 8; + + FragB frag_b0 = dequant(b_quant); + + // Apply scale to frag_b0 + if constexpr (has_act_order) { + scale4(frag_b0, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], + act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 0); + } else { + if constexpr (group_blocks != -1) { + scale(frag_b0, frag_s[k % 2][j], 0); + } + } + + FragB frag_b1 = dequant(b_quant_shift); + + // Apply scale to frag_b1 + if constexpr (has_act_order) { + scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], + act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 1); + + } else { + if constexpr (group_blocks != -1) { + scale(frag_b1, frag_s[k % 2][j], 1); + } + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride / 2; + if (red_off >= 1) { + int red_idx = threadIdx.x / b_sh_stride; + constexpr int red_sh_stride = b_sh_stride * 4 * 2; + constexpr int red_sh_delta = b_sh_stride; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + + (threadIdx.x % b_sh_stride); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + + #pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { + #pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { + #pragma unroll + for (int j = 0; j < 4 * 2; j++) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + #pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { + #pragma unroll + for (int i = 0; i < 4 * 2; i++) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. + auto global_reduce = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. 
To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + if (threadIdx.x < active_threads) { + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + int row = (threadIdx.x % 32) / 4; + + if (!first) { + // Interestingly, doing direct global accesses here really seems to mess up + // the compiler and lead to slowdowns, hence we also use async-copies even + // though these fetches are not actually asynchronous. + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + int c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + int sorted_row = sorted_ids[c_idx / c_gl_stride]; + int new_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; + cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], &C[new_idx], + sorted_row < tot_m * topk && + (8 * (i / 2) + row < prob_m && + (i < (thread_m_blocks - 1) * 4 || + sorted_ids[8 * (i / 2) + row] < tot_m * topk))); + } + cp_async_fence(); + cp_async_wait<0>(); + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + if (8 * (i / 2) + row < prob_m && + (i < (thread_m_blocks - 1) * 4 || + sorted_ids[8 * (i / 2) + row] < tot_m * topk)) { + if (!first) { + int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += + __half2float(reinterpret_cast<__half*>(&c_red)[j]); + } + } + if (!last) { + int4 c; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast<__half*>(&c)[j] = + __float2half(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); + } + int c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + int row = sorted_ids[c_idx / c_gl_stride]; + if (row < tot_m * topk) { + int new_idx = row * c_gl_stride + c_idx % c_gl_stride; + C[new_idx] = c; + } + } + } + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. 
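+  // For the MoE path this is also where the top-k routing weight is applied:
+  // when apply_weights is set, every output row is scaled by topk_weights[row]
+  // just before being written back to C.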
+ auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + int c_gl_wr_end = c_gl_stride * prob_m; + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS& s) { + half2 res = __halves2half2(__float2half(c0), __float2half(c1)); + + // For per-column quantization we finally apply the scale here + if constexpr (!has_act_order && group_blocks == -1) { + res = __hmul2(res, s[0]); + } + + ((half2*)sh)[idx] = res; + }; + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + + #pragma unroll + for (int i = 0; + i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + if (c_gl_wr < c_gl_wr_end) { + int row = sorted_ids[c_gl_wr / c_gl_stride]; + if (row < tot_m * topk) { + int off = row * c_gl_stride + c_gl_wr % c_gl_stride; + if (!apply_weights) { + C[off] = sh[c_sh_rd]; + } else { + __half* ctrg = reinterpret_cast<__half*>(&C[off]); + __half* csrc = reinterpret_cast<__half*>(&sh[c_sh_rd]); + for (int j = 0; j < 8; ++j) { + ctrg[j] = __float2half(topk_weights[row] * __half2float(csrc[j])); + } + } + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + } + }; + + // Start global fetch and register load pipelines. + auto start_pipes = [&]() { + // fetch_sorted_ids_to_shared(); + __syncthreads(); + + #pragma unroll + for (int i = 0; i < stages - 1; i++) { + if (has_act_order && i == 0) { + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]); + } + fetch_to_shared(i, i, i < slice_iters); + } + + zero_accums(); + wait_for_stage(); + init_same_group(0); + fetch_to_registers(0, 0); + fetch_scales_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + slice_k_start_shared_fetch += tb_k * (stages - 1); + }; + if (slice_iters) { + start_pipes(); + } + + // Main loop. + while (slice_iters) { + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines + // have even length meaning that the next iteration will always start at + // index 0. 
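+    // Each iteration of the outer `pipe` loop below advances the global fetch
+    // pipeline by one stage and consumes one k-tile of the current slice
+    // (b_sh_wr_iters register loads and matmuls), decrementing slice_iters.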
+ #pragma unroll + for (int pipe = 0; pipe < stages;) { + #pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + fetch_scales_to_registers(k + 1, pipe); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages); + pipe++; + wait_for_stage(); + init_same_group(pipe % stages); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) { + break; + } + } + + a_gl_rd += a_gl_rd_delta_o * stages; + slice_k_start += tb_k * stages; + slice_k_start_shared_fetch += tb_k * stages; + + if constexpr (has_act_order) { + int first_group_id = g_idx[slice_k_start]; + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + int last_group_id = g_idx[last_g_idx]; + if (last_group_id >= sh_first_group_id + sh_num_groups) { + fetch_scales_to_shared(false, first_group_id, last_group_id); + __syncthreads(); + } + } + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. + if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if constexpr (!has_act_order && group_blocks == -1) { + if (last) { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + cp_async_fence(); + } + } + + thread_block_reduce(); + if constexpr (!has_act_order && group_blocks == -1) { + if (last) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } + } + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice + barrier_acquire(&locks[slice_col], slice_idx); + global_reduce(slice_idx == 0, last); + barrier_release(&locks[slice_col], last); + } + if (last) // only the last block in a slice actually writes the result + write_result(); + slice_row = 0; + slice_col_par++; + slice_col++; + init_slice(); + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + } + + // Update slice k/n for scales loading + if constexpr (has_act_order) { + slice_k_start = tb_k * slice_row; + slice_k_finish = slice_k_start + tb_k * slice_iters; + slice_k_start_shared_fetch = slice_k_start; + slice_n_offset = act_s_col_tb_stride * slice_col; + + } else { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } + start_pipes(); + } + } + } +} + +#else + +// TODO could just run MarlinMoE? 
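+// In the !CPU_OFFSETS configuration the per-expert GEMM body is factored into
+// the __device__ helper below (presumably so a single launched kernel can
+// iterate over experts using GPU-computed expert offsets); apart from the
+// missing barrier_ctrs argument it mirrors the MarlinMoE kernel above.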
+template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale + > +__device__ inline void RunSingleIter( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int* __restrict__ sorted_ids, // int32 sorted ids of experts + const float* __restrict__ topk_weights, // float topk weights + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int* __restrict__ g_idx, // int32 group indices of shape k + const int* __restrict__ expert_offsets, + int num_groups, // number of scale groups per output channel + int expert_idx, // idx of current expert // TODO must decide based on offsets + int num_experts, // number of experts + int topk, // topk parameter of moe + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int tot_m, // total number of rows in A and C + int* locks, // extra global storage for barrier synchronization + bool replicate_input, // do we use the same input for each expert? + bool apply_weights, // apply weights to output + int try_m_block_ctr // experiment +) { + + // if (threadIdx.x == 0 && blockIdx.x == 0) { + // printf("%d, %d\n", thread_m_blocks, prob_m); + // } + + // For larger GEMMs we run multiple batchsize 64 versions in parallel for a + // better partitioning with less reductions + int parallel = 1; + if (prob_m > 16 * thread_m_blocks) { + parallel = prob_m / (16 * thread_m_blocks); + prob_m = 16 * thread_m_blocks; + } + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); + + if constexpr (!has_act_order && group_blocks != -1) { + if (group_blocks >= thread_k_blocks) { + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts + // in the middle of group. + iters = (group_blocks / thread_k_blocks) * + ceildiv(iters, (group_blocks / thread_k_blocks)); + } + } + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + locks += (slice_col_par / n_tiles) * n_tiles; + slice_col = slice_col_par % n_tiles; + sorted_ids += (slice_col_par / n_tiles) * 16 * thread_m_blocks; + } + + // Compute all information about the current slice which is required for + // synchronization. 
+ auto init_slice = [&]() { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = ceildiv(k_tiles - col_off, iters); + if (col_off > 0) slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) slice_idx--; + } + } + if (slice_col == n_tiles) { + sorted_ids += 16 * thread_m_blocks; + // sorted_off += 16 * thread_m_blocks; + // printf("advance 2: %d (%d %d)\n", sorted_off, blockIdx.x, threadIdx.x); + locks += n_tiles; + slice_col = 0; + } + }; + init_slice(); + + // A sizes/strides + + // stride of the A matrix in global memory + int a_gl_stride = prob_k / 8; + // stride of an A matrix tile in shared memory + constexpr int a_sh_stride = 16 * thread_k_blocks / 8; + // delta between subsequent A tiles in global memory + constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; + // between subsequent accesses within a tile + int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); + // between shared memory writes + constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); + // between shared memory tile reads + constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); + // within a shared memory tile + constexpr int a_sh_rd_delta_i = a_sh_stride * 16; + // overall size of a tile + constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks); + // number of shared write iterations for a tile + constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta); + + // B sizes/strides + int b_gl_stride = 16 * prob_n / 32; + constexpr int b_sh_stride = 32 * thread_n_blocks / 4; + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); + constexpr int b_sh_wr_delta = threads; + constexpr int b_sh_rd_delta = threads; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + // Scale sizes/strides without act_order + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + constexpr int s_tb_groups = !has_act_order && group_blocks < thread_k_blocks + ? thread_k_blocks / group_blocks + : 1; + constexpr int s_sh_stage = s_tb_groups * s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + // Scale size/strides with act_order + constexpr int tb_k = 16 * thread_k_blocks; + constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0; + // constexpr int act_s_row_stride = 1; + // int act_s_col_stride = act_s_row_stride * num_groups; + int act_s_col_stride = 1; + int act_s_col_warp_stride = act_s_col_stride * 8; + int tb_n_warps = thread_n_blocks / 4; + int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; + + constexpr int sorted_sh_stride = threads; + constexpr int sorted_gl_stride = threads; + + // Global A read index of current thread. 
+ int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. + int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = + b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x; + int b_sh_rd = threadIdx.x; + + // For act_order + constexpr int k_iter_size = tb_k / b_sh_wr_iters; + int slice_k_start = tb_k * slice_row; + int slice_k_finish = slice_k_start + tb_k * slice_iters; + int slice_k_start_shared_fetch = slice_k_start; + int slice_n_offset = act_s_col_tb_stride * slice_col; + + // No act_order + int s_gl_rd; + if constexpr (group_blocks == -1 || group_blocks == 0) { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } else { + s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + s_sh_stride * slice_col + threadIdx.x; + } + int s_sh_wr = threadIdx.x; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. + int s_sh_rd; + if constexpr (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + int sh_first_group_id = -1; + int sh_num_groups = -1; + constexpr int sh_max_num_groups = 32; + + int shs_size; + if constexpr (has_act_order) + shs_size = sh_max_num_groups * s_sh_stride + threads; + else + shs_size = group_blocks > 0 ? stages * s_sh_stage : threads; + + extern __shared__ int4 sh[]; + // Shared memory storage for global fetch pipelines. + int4* sh_a = sh; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_g_idx = sh_b + (stages * b_sh_stage); + int4* sh_s = sh_g_idx + (stages * g_idx_stage); + int* sh_sorted = (int*)(sh_s + shs_size); + + // Precompute which thread should not read memory in which iterations; this is + // needed if there are more threads than required for a certain tilesize or + // when the batchsize is not a multiple of 16. + bool a_sh_wr_pred[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + int a_idx = a_sh_wr_delta * i + a_sh_wr; + int row = a_idx / a_gl_rd_delta_o; + if (row >= prob_m) { + a_sh_wr_pred[i] = false; + } else { + a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; + } + } + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? 
+ auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. + int a_sh_wr_trans[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; // No act-order + FragS act_frag_s[2][4][4]; // For act-order + + // Zero accumulators. + auto zero_accums = [&]() { + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + auto fetch_scales_to_shared = [&](bool is_async, int first_group_id, + int last_group_id) { + sh_first_group_id = first_group_id; + sh_num_groups = last_group_id - first_group_id + 1; + + if (sh_num_groups < sh_max_num_groups) { + sh_num_groups = sh_max_num_groups; + } + + if (sh_first_group_id + sh_num_groups > num_groups) { + sh_num_groups = num_groups - sh_first_group_id; + } + + int row_offset = first_group_id * s_gl_stride; + + if (is_async) { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], + &scales_ptr[row_offset + (i * s_gl_stride) + + slice_n_offset + threadIdx.x]); + } + } + } else { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + sh_s[(i * s_sh_stride) + threadIdx.x] = + scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + + threadIdx.x]; + } + } + } + }; + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + int a_idx = a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off; + int row = a_idx / a_gl_stride; + int sorted_row = + replicate_input ? sorted_ids[row] / topk : sorted_ids[row]; + // if (expert_idx == 0) { + // printf("row A: %d (%d %d), iter %d\n", row, blockIdx.x, threadIdx.x, i); + // } + int new_idx = sorted_row * a_gl_stride + a_idx % a_gl_stride; + if (sorted_row < tot_m * (replicate_input ? 1 : topk) && + new_idx < a_gl_stride * tot_m * (replicate_input ? 
1 : topk)) { + cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[new_idx], + a_sh_wr_pred[i]); + } + } + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); + B_ptr[i] += b_gl_rd_delta_o; + } + + if constexpr (has_act_order) { + // Fetch g_idx thread-block portion + int full_pipe = a_off; + int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; + if (cur_k < prob_k && cur_k < slice_k_finish) { + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + + int4 const* cur_g_idx_stage_ptr = + reinterpret_cast(&g_idx[cur_k]); + + if (threadIdx.x < g_idx_stage) { + cp_async4_pred(&sh_g_idx_stage[threadIdx.x], + &cur_g_idx_stage_ptr[threadIdx.x]); + } + } + } else { + if constexpr (group_blocks != -1) { + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch scales if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } else { + for (int i = 0; i < s_tb_groups; i++) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], + &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } + } + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + // TODO fix + auto fetch_sorted_ids_to_shared = [&]() { + const int mpt = ceildiv(prob_m, threads); + for (int i = 0; i < mpt; i++) { + if ((i * sorted_gl_stride) + threadIdx.x < prob_m) { + sh_sorted[(i * sorted_sh_stride) + threadIdx.x] = + sorted_ids[(i * sorted_gl_stride) + threadIdx.x]; + } + } + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. 
+ auto fetch_to_registers = [&](int k, int pipe) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + frag_b_quant[k % 2] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); + }; + + bool is_same_group[stages]; + int same_group_id[stages]; + + auto init_same_group = [&](int pipe) { + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + int group_id_1 = sh_g_idx_int_ptr[0]; + int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; + + is_same_group[pipe] = group_id_1 == group_id_2; + same_group_id[pipe] = group_id_1; + }; + + auto fetch_scales_to_registers = [&](int k, int full_pipe) { + int pipe = full_pipe % stages; + + if constexpr (!has_act_order) { + // No act-order case + if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = k_blocks / group_blocks; + + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + reinterpret_cast(&frag_s[k % 2])[0] = + sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; + } + } + + return; + } + + // Act-order case + + // Determine K of the "current" thread-block + int cur_k = slice_k_start + tb_k * full_pipe; + if (cur_k >= prob_k || cur_k >= slice_k_finish) { + return; + } + + // Reset (to current thread-block) since we read g_idx portion from the + // shared memory + cur_k = 0; + + // Progress to current iteration + cur_k += k_iter_size * (k % b_sh_wr_iters); + + // Determine "position" inside the thread-block (based on warp and + // thread-id) + int warp_id = threadIdx.x / 32; + int n_warps = + thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N + + int warp_row = warp_id / n_warps; + int warp_col = warp_id % n_warps; + + cur_k += warp_row * 16; + + int th_id = threadIdx.x % 32; + cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix + + int s_col_shift = + /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + + (th_id / 4) * act_s_col_stride; + + if (is_same_group[pipe]) { + if (k % 2 == 0) { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + + s_col_shift]; + } else { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); + } + + for (int i = 1; i < 4; i++) { + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); + } + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + constexpr int k_frag_offsets[4] = {0, 1, 8, + 9}; // Tensor core offsets per thread + + #pragma unroll + for (int i = 0; i < 4; i++) { + int actual_k = cur_k + k_frag_offsets[i]; + + int group_id = sh_g_idx_int_ptr[actual_k]; + int rel_group_id = group_id - sh_first_group_id; + + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + sh_s[rel_group_id * s_sh_stride + s_col_shift]; + } + }; + 
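+  // Note on the B encoding: each 32-bit word of frag_b_quant packs eight
+  // 4-bit weights; matmul() below dequantizes them as two FragB halves (the
+  // low nibbles, and the word shifted right by 8) before applying scales and
+  // issuing the mma instructions.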
+ // Execute the actual tensor core matmul of a sub-tile. + auto matmul = [&](int k) { + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. + #pragma unroll + for (int j = 0; j < 4; j++) { + int b_quant = frag_b_quant[k % 2][j]; + int b_quant_shift = b_quant >> 8; + + FragB frag_b0 = dequant(b_quant); + + // Apply scale to frag_b0 + if constexpr (has_act_order) { + scale4(frag_b0, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], + act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 0); + } else { + if constexpr (group_blocks != -1) { + scale(frag_b0, frag_s[k % 2][j], 0); + } + } + + FragB frag_b1 = dequant(b_quant_shift); + + // Apply scale to frag_b1 + if constexpr (has_act_order) { + scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], + act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 1); + + } else { + if constexpr (group_blocks != -1) { + scale(frag_b1, frag_s[k % 2][j], 1); + } + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride / 2; + if (red_off >= 1) { + int red_idx = threadIdx.x / b_sh_stride; + constexpr int red_sh_stride = b_sh_stride * 4 * 2; + constexpr int red_sh_delta = b_sh_stride; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + + (threadIdx.x % b_sh_stride); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + + #pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { + #pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { + #pragma unroll + for (int j = 0; j < 4 * 2; j++) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + #pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { + #pragma unroll + for (int i = 0; i < 4 * 2; i++) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. + auto global_reduce = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. 
To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + if (threadIdx.x < active_threads) { + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + int row = (threadIdx.x % 32) / 4; + + if (!first) { + // Interestingly, doing direct global accesses here really seems to mess up + // the compiler and lead to slowdowns, hence we also use async-copies even + // though these fetches are not actually asynchronous. + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + int c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + int sorted_row = sorted_ids[c_idx / c_gl_stride]; + // printf("row C reduce:\n"); + // printf("row C reduce: %d (%d %d)\n", c_idx / c_gl_stride, blockIdx.x, threadIdx.x); + int new_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; + cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], &C[new_idx], + sorted_row < tot_m * topk && + (8 * (i / 2) + row < prob_m && + (i < (thread_m_blocks - 1) * 4 || + sorted_ids[8 * (i / 2) + row] < tot_m * topk))); + } + cp_async_fence(); + cp_async_wait<0>(); + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + if (8 * (i / 2) + row < prob_m && + (i < (thread_m_blocks - 1) * 4 || + sorted_ids[8 * (i / 2) + row] < tot_m * topk)) { + if (!first) { + int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += + __half2float(reinterpret_cast<__half*>(&c_red)[j]); + } + } + if (!last) { + int4 c; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast<__half*>(&c)[j] = + __float2half(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); + } + int c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + int row = sorted_ids[c_idx / c_gl_stride]; + if (row < tot_m * topk) { + int new_idx = row * c_gl_stride + c_idx % c_gl_stride; + C[new_idx] = c; + } + } + } + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. 
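+  // Output rows are scattered back through sorted_ids, and when apply_weights
+  // is set the per-token top-k routing weight is multiplied in during the
+  // store.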
+ auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + int c_gl_wr_end = c_gl_stride * prob_m; + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS& s) { + half2 res = __halves2half2(__float2half(c0), __float2half(c1)); + + // For per-column quantization we finally apply the scale here + if constexpr (!has_act_order && group_blocks == -1) { + res = __hmul2(res, s[0]); + } + + ((half2*)sh)[idx] = res; + }; + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + + #pragma unroll + for (int i = 0; + i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + if (c_gl_wr < c_gl_wr_end) { + int row = sorted_ids[c_gl_wr / c_gl_stride]; + // if (blockIdx.x == 8 && threadIdx.x == 95) { + // printf("row C write: %d (%d %d)\n", c_gl_wr / c_gl_stride, blockIdx.x, threadIdx.x); + // } + if (row < tot_m * topk) { + int off = row * c_gl_stride + c_gl_wr % c_gl_stride; + if (!apply_weights) { + C[off] = sh[c_sh_rd]; + } else { + __half* ctrg = reinterpret_cast<__half*>(&C[off]); + __half* csrc = reinterpret_cast<__half*>(&sh[c_sh_rd]); + for (int j = 0; j < 8; ++j) { + ctrg[j] = __float2half(topk_weights[row] * __half2float(csrc[j])); + } + } + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + } + }; + + // Start global fetch and register load pipelines. + auto start_pipes = [&]() { + // fetch_sorted_ids_to_shared(); + __syncthreads(); + + #pragma unroll + for (int i = 0; i < stages - 1; i++) { + if (has_act_order && i == 0) { + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]); + } + fetch_to_shared(i, i, i < slice_iters); + } + + zero_accums(); + wait_for_stage(); + init_same_group(0); + fetch_to_registers(0, 0); + fetch_scales_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + slice_k_start_shared_fetch += tb_k * (stages - 1); + }; + if (slice_iters) { + start_pipes(); + } + + // Main loop. 
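+  // Each iteration advances the `stages`-deep software pipeline: registers and
+  // scales for sub-tile k + 1 are fetched while matmul(k) runs, and one new
+  // shared memory stage is issued per pipe step. Once slice_iters reaches zero
+  // the partial results are reduced (within the threadblock and, if the slice
+  // spans several blocks, through global memory) and written out before
+  // advancing to the next column slice.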
+ while (slice_iters) { + // printf("slice\n"); + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines + // have even length meaning that the next iteration will always start at + // index 0. + #pragma unroll + for (int pipe = 0; pipe < stages;) { + #pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + fetch_scales_to_registers(k + 1, pipe); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages); + pipe++; + wait_for_stage(); + init_same_group(pipe % stages); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) { + break; + } + } + + a_gl_rd += a_gl_rd_delta_o * stages; + slice_k_start += tb_k * stages; + slice_k_start_shared_fetch += tb_k * stages; + + if constexpr (has_act_order) { + int first_group_id = g_idx[slice_k_start]; + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + int last_group_id = g_idx[last_g_idx]; + if (last_group_id >= sh_first_group_id + sh_num_groups) { + fetch_scales_to_shared(false, first_group_id, last_group_id); + __syncthreads(); + } + } + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. + if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if constexpr (!has_act_order && group_blocks == -1) { + if (last) { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + cp_async_fence(); + } + } + + thread_block_reduce(); + if constexpr (!has_act_order && group_blocks == -1) { + if (last) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } + } + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice + // TODO we deadlock here + barrier_acquire(&locks[slice_col], slice_idx); + global_reduce(slice_idx == 0, last); + barrier_release(&locks[slice_col], last); + } + if (last) // only the last block in a slice actually writes the result + write_result(); + slice_row = 0; + slice_col_par++; + slice_col++; + init_slice(); + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + } + + // Update slice k/n for scales loading + if constexpr (has_act_order) { + slice_k_start = tb_k * slice_row; + slice_k_finish = slice_k_start + tb_k * slice_iters; + slice_k_start_shared_fetch = slice_k_start; + slice_n_offset = act_s_col_tb_stride * slice_col; + + } else { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } + start_pipes(); + } + } + } +} + +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale + > +__global__ void MarlinMoE( + const int4* __restrict__ A, // fp16 input 
matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int* __restrict__ sorted_ids_base, // int32 sorted ids of experts + const float* __restrict__ topk_weights, // float topk weights + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int* __restrict__ g_idx, // int32 group indices of shape k + const int* __restrict__ expert_offsets, + int num_groups, // number of scale groups per output channel + int expert_idx, // idx of current expert // TODO must decide based on offsets + int num_experts, // number of experts + int topk, // topk parameter of moe + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int tot_m, // total number of rows in A and C + int* locks, // extra global storage for barrier synchronization + bool replicate_input, // do we use the same input for each expert? + bool apply_weights, // apply weights to output + int try_m_block_ctr, // experiment + int* barrier_ctrs +) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. + + int m_block_ctr = try_m_block_ctr; + + constexpr int max_par = 4; // TODO should be passed as arg + const int* sorted_ids_expert = sorted_ids_base + expert_offsets[expert_idx] + + m_block_ctr * 4 * max_par; + int tot_its = expert_offsets[expert_idx + 1] - + expert_offsets[expert_idx]; + if (tot_its == 0) { + return; + } + // TODO try no padding? 
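+  // The rows routed to this expert are processed in blocks of 16 (one
+  // m-block), at most four m-blocks (64 rows) per pass; larger batches are
+  // folded into `par`-way parallel passes capped at max_par.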
+ int tot_m_blocks = ceildiv(tot_its, 16); + // int pad = 16 * tot_m_blocks - tot_its; + + // Main loop + for (int m_block_ctr = 0; m_block_ctr < tot_m_blocks; m_block_ctr += 4) { + + const int* sorted_ids = sorted_ids_expert; + // if (m_block_ctr >= tot_m_blocks) { + // return; + // } + + // int* locks = locks_base; //+ (prob_n / 64 * 16) * (m_block_ctr / 4); + + int max_block = tot_m_blocks - m_block_ctr; + prob_m = tot_its - 16 * m_block_ctr; + int full_prob_m = prob_m; + + // int m_offset = m_block_ctr * 16; + // printf("call with m_offset: %d / %d\n", m_offset, tot_its); + + int par = 1; + if (max_block > 4) { + // Note that parallel > 1 currently only works for inputs without any + // padding + // par = (16 * max_block - pad) / 64; + par = min((16 * max_block) / 64, max_par); + prob_m = 64 * par; + m_block_ctr += 4 * (par - 1); + max_block = 4; + } + + if (max_block == 1) { + RunSingleIter( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + expert_offsets, num_groups, expert_idx, num_experts, topk, + prob_m, prob_n, prob_k, tot_m, locks, replicate_input, + apply_weights, try_m_block_ctr); + } + else if (max_block == 2) { + RunSingleIter( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + expert_offsets, num_groups, expert_idx, num_experts, topk, + prob_m, prob_n, prob_k, tot_m, locks, replicate_input, + apply_weights, try_m_block_ctr); + } + else if (max_block == 3) { + RunSingleIter( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + expert_offsets, num_groups, expert_idx, num_experts, topk, + prob_m, prob_n, prob_k, tot_m, locks, replicate_input, + apply_weights, try_m_block_ctr); + } + else { + RunSingleIter( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + expert_offsets, num_groups, expert_idx, num_experts, topk, + prob_m, prob_n, prob_k, tot_m, locks, replicate_input, + apply_weights, try_m_block_ctr); + } + + // sorted_ids_expert += 16 * max_block * par; + // break; + // cooperative_groups::this_grid().sync(); + // __atomic__ int ctr; + if (threadIdx.x == 0) { + printf("start bar0 %d %d %d | %d\n", barrier_ctrs[0], barrier_ctrs[1], + barrier_ctrs[2], gridDim.x); + atomicAdd(&barrier_ctrs[0], 1); + // if (barrier_ctrs[2] == gridDim.x) { + // barrier_ctrs[2] = 0; + // } + // else { + while(barrier_ctrs[0] != gridDim.x); + // } + if (blockIdx.x == 0) { + barrier_ctrs[2] = 0; + } + printf("start bar1 %d %d %d | %d\n", barrier_ctrs[0], barrier_ctrs[1], + barrier_ctrs[2], gridDim.x); + atomicAdd(&barrier_ctrs[1], 1); + // if (barrier_ctrs[0] == gridDim.x) { + // barrier_ctrs[0] = 0; + // } + // else { + while(barrier_ctrs[1] != gridDim.x); + // } + if (blockIdx.x == 0) { + barrier_ctrs[0] = 0; + } + printf("start bar2 %d %d %d | %d\n", barrier_ctrs[0], barrier_ctrs[1], + barrier_ctrs[2], gridDim.x); + atomicAdd(&barrier_ctrs[2], 1); + // if (barrier_ctrs[1] == gridDim.x) { + // barrier_ctrs[1] = 0; + // } + // else { + while(barrier_ctrs[2] != gridDim.x); + // } + if (blockIdx.x == 0) { + barrier_ctrs[1] = 0; + } + printf("end bar %d\n", gridDim.x); + } + + // barrier_acquire(&locks2[blockIdx.x], gridDim.x, 0, 0); + // barrier_release(&locks2[blockIdx.x], gridDim.x, 0, 0); + + } +} + +#endif + +#else + +__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, + int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, int size_m, + int size_k, int block_rows) { + // Marlin is not implemented yet for SM < 8.0 + assert(false); + return; +} + +__global__ void compute_expert_offsets(int const* 
__restrict__ topk_ids,
+                                       int* __restrict__ expert_offsets,
+                                       int topk_length,
+                                       int block_size) {
+  // Marlin is not implemented yet for SM < 8.0
+  assert(false);
+  return;
+}
+
+template <const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const bool has_act_order,    // whether act_order is enabled
+          const int group_blocks = -1  // number of consecutive 16x16 blocks
+                                       // with a separate quantization scale
+          >
+__global__ void MarlinMoE(
+    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,        // fp16 output buffer of shape mxn
+    const int* __restrict__ sorted_ids,      // int32 sorted ids of experts
+    const float* __restrict__ topk_weights,  // float topk weights
+    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
+                                          // (k/groupsize)xn
+    const int* __restrict__ g_idx,        // int32 group indices of shape k
+    const int* __restrict__ expert_offsets,
+    int num_groups,        // number of scale groups per output channel
+    int expert_idx,        // idx of current expert
+    int num_experts,       // number of experts
+    int topk,              // topk parameter of moe
+    int prob_m,            // batch dimension m
+    int prob_n,            // output dimension n
+    int prob_k,            // reduction dimension k
+    int tot_m,             // total number of rows in A and C
+    int* locks,            // extra global storage for barrier synchronization
+    bool replicate_input,  // do we use the same input for each expert?
+    bool apply_weights,    // apply weights to output
+    int try_m_block_ctr,
+    int* barrier_ctrs
+) {
+  // Marlin is not implemented yet for SM < 8.0
+  assert(false);
+  return;
+}
+
+#endif
+
+// 8 warps are a good choice since every SM has 4 schedulers and having more
+// than 1 warp per schedule allows some more latency hiding. At the same time,
+// we want relatively few warps to have many registers per warp and small tiles.
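+// The thread configs below are given as (thread_k, thread_n, num_threads);
+// e.g. the default small-batch config {128, 128, 256} yields
+// thread_k_blocks = thread_n_blocks = 8, so the dispatch resolves to the
+// CALL_IF_MOE(8, 8, 256) instantiation.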
+const int USER_THREADS = + 256; // Note: This is only used with user-provided thread_k/n +const int STAGES = 4; // 4 pipeline stages fit into shared memory +// const int SHARED_MEM = +// 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) + +static constexpr int min_thread_n = 64; +static constexpr int min_thread_k = 64; + +#define __CALL_IF_MOE(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS) \ + else if (thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \ + num_threads == NUM_THREADS) { \ + cudaFuncSetAttribute( \ + MarlinMoE, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ + MarlinMoE \ + <<>>( \ + A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \ + g_idx_ptr, expert_offsets2_ptr, num_groups, expert_idx, \ + num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, \ + replicate_input, apply_weights, m_block, barrier_ctrs_ptr); \ + } + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, // Default + {128, 64, 128}, // Reduce N 2X, same K + {64, 256, 256}, // Reduce K 2X, increase N 2X + {64, 128, 128}, // Reduce K 2X, same N +}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, // Default + {128, 128, 256}, // Reduce N 2X, increase K 2X + {64, 128, 128}, // Reduce N 2X, same K + {128, 64, 128}, // Reduce N 4X, increase K 2X +}; + +bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, + int prob_k) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || + th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // thread_k can be only 128 or 64 (because it must be less than groupsize + // which is 128) + if (th_config.thread_k != 128 && th_config.thread_k != 64) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + return true; +} + +thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { + if (prob_m <= 16) { + for (auto th_config : small_batch_thread_configs) { + if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { + return th_config; + } + } + + } else { + for (auto th_config : large_batch_thread_configs) { + if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { + return th_config; + } + } + } + + return thread_config_t{-1, -1, -1}; +} + +#define CALL_IF_MOE(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + \ + __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, 
false, 8, NUM_THREADS) \ + \ + __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) + +void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, + const void* sorted_ids, const void* topk_weights, + const void* topk_ids, + const void* s, const void* g_idx, const void* perm, + void* a_tmp, void* expert_offsets, void* expert_offsets2, int prob_m, + int prob_n, int prob_k, void* workspace, + bool has_act_order, bool is_k_full, int num_groups, + int group_size, + int num_experts, int topk, int moe_block_size, int dev, + cudaStream_t stream, int thread_k, int thread_n, + int sms, int max_par, bool replicate_input, + bool apply_weights, void* barrier_ctrs) { + TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, + ", ", prob_n, ", ", prob_k, "]"); + + if (sms == -1) { + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); + } + + // Set thread config + thread_config_t th_config; + if (thread_k != -1 && thread_n != -1) { + // User-defined config + th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; + } else { + // Auto config + th_config = determine_thread_config(prob_m, prob_n, prob_k); + } + + TORCH_CHECK(is_valid_config(th_config, prob_m, prob_n, prob_k), + "Invalid thread config: thread_k = " + str(th_config.thread_k) + + ", thread_n = " + str(th_config.thread_n) + + ", num_threads = " + str(th_config.num_threads) + + " for MKN = [" + str(prob_m) + ", " + str(prob_k) + ", " + + str(prob_n) + "]"); + + int num_threads = th_config.num_threads; + thread_k = th_config.thread_k; + thread_n = th_config.thread_n; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + + int blocks = sms; + // printf("sms: %d\n", sms); + + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + + int group_blocks = 0; + if (has_act_order) { + if (is_k_full) { + TORCH_CHECK(group_size != -1); + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } else { + TORCH_CHECK(group_size == 0); + group_blocks = 0; + } + + } else { + if (group_size == -1) { + group_blocks = -1; + } else { + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } + } + + int max_shared_mem = 0; + cudaDeviceGetAttribute(&max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + TORCH_CHECK(max_shared_mem > 0); + + int tot_m = prob_m; + + #if CPU_OFFSETS + const long* expert_offsets_ptr = (const long*)expert_offsets; + int* expert_offsets2_ptr = (int*)expert_offsets2; + #else + const int* 
topk_ids_ptr = (const int*)topk_ids; + int* expert_offsets2_ptr = (int*)expert_offsets2; + compute_expert_offsets<<<1, num_experts, 0, stream>>>( + topk_ids_ptr, expert_offsets2_ptr, tot_m * topk, moe_block_size); + #endif + int* barrier_ctrs_ptr = (int*)barrier_ctrs; + + bool do_permute_a = has_act_order; + + // If we have a full K, then we can run the non-act-order version of Marlin + // (since the weight rows are reordered by increasing group ids, and by + // having a full K, we have full original groups) + if (is_k_full) { + has_act_order = false; + } + + for (int expert_idx = 0; expert_idx < num_experts; ++expert_idx) { + #if CPU_OFFSETS + const int4* A_ptr = (const int4*)A; + int4* a_tmp_ptr = (int4*)a_tmp; + const int4* B_ptr = (const int4*)B + (prob_n * prob_k / 32) * expert_idx; + int4* C_ptr = (int4*)C; + const float* topk_weights_ptr = (const float*)topk_weights; + const int* sorted_ids_ptr = + (const int*)sorted_ids + expert_offsets_ptr[expert_idx]; + const int4* s_ptr = + (const int4*)s + + (((group_size == -1 || group_size == 0) ? 1 : prob_k / group_size) * + prob_n / 8) * + expert_idx; + + const int* g_idx_ptr = (const int*)g_idx + prob_k * expert_idx; + const int* perm_ptr = (const int*)perm + prob_k * expert_idx; + int* locks = (int*)workspace; + + if (do_permute_a) { + // Permute A columns + int topk_rows = replicate_input ? tot_m : tot_m * topk; + int block_rows = ceildiv(topk_rows, blocks); + permute_cols_kernel<<>>( + A_ptr, perm_ptr, a_tmp_ptr, topk_rows, prob_k, block_rows); + A_ptr = a_tmp_ptr; + } + + int tot_its = expert_offsets_ptr[expert_idx + 1] - + expert_offsets_ptr[expert_idx]; // prob_m; + // printf("%d ", tot_its); + if (tot_its == 0) { + continue; + } + int tot_m_blocks = ceildiv(tot_its, 16); + int pad = 16 * tot_m_blocks - tot_its; + + // Main loop + for (int i = 0; i < tot_m_blocks; i += 4) { + int thread_m_blocks = tot_m_blocks - i; + prob_m = tot_its - 16 * i; + int par = 1; + if (thread_m_blocks > 4) { + // Note that parallel > 1 currently only works for inputs without any + // padding + par = (16 * thread_m_blocks - pad) / 64; + if (par > max_par) par = max_par; + prob_m = 64 * par; + i += 4 * (par - 1); + thread_m_blocks = 4; + } + + // doesn't matter for this version of the code + int m_block = 0; + + // Define kernel configurations + + if (false) { + } + CALL_IF_MOE(16, 4, 256) + CALL_IF_MOE(8, 8, 256) + CALL_IF_MOE(8, 4, 128) + CALL_IF_MOE(4, 8, 128) + else { + TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + + str(prob_n) + ", " + str(prob_k) + "]" + + ", has_act_order = " + str(has_act_order) + + ", num_groups = " + str(num_groups) + + ", group_size = " + str(group_size) + + ", thread_m_blocks = " + str(thread_m_blocks) + + ", thread_n_blocks = " + str(thread_n_blocks) + + ", thread_k_blocks = " + str(thread_k_blocks)); + } + + sorted_ids_ptr += 16 * thread_m_blocks * par; + // break; + } + + ///// + + #else + + ///// + + const int4* A_ptr = (const int4*)A; + int4* a_tmp_ptr = (int4*)a_tmp; + const int4* B_ptr = (const int4*)B + (prob_n * prob_k / 32) * expert_idx; + int4* C_ptr = (int4*)C; + const float* topk_weights_ptr = (const float*)topk_weights; + // TODO can't know expert_offsets at this point + const int* sorted_ids_ptr = + (const int*)sorted_ids;// + expert_offsets_ptr[expert_idx]; + const int4* s_ptr = + (const int4*)s + + (((group_size == -1 || group_size == 0) ? 
1 : prob_k / group_size) * + prob_n / 8) * + expert_idx; + + const int* g_idx_ptr = (const int*)g_idx + prob_k * expert_idx; + const int* perm_ptr = (const int*)perm + prob_k * expert_idx; + int* locks = (int*)workspace; + + // TODO we need an expert identifying mechanism here too + if (do_permute_a) { + // Permute A columns + int topk_rows = replicate_input ? tot_m : tot_m * topk; + int block_rows = ceildiv(topk_rows, blocks); + permute_cols_kernel<<>>( + A_ptr, perm_ptr, a_tmp_ptr, topk_rows, prob_k, block_rows); + A_ptr = a_tmp_ptr; + } + + int max_m_blocks = ceildiv(tot_m, 16); + int m_block = 0; + // for (int m_block = 0; m_block < max_m_blocks; m_block += 16) { + // Define kernel configurations + + // make it max possible value + int thread_m_blocks = 4; + + if (false) { + } + CALL_IF_MOE(16, 4, 256) + CALL_IF_MOE(8, 8, 256) + CALL_IF_MOE(8, 4, 128) + CALL_IF_MOE(4, 8, 128) + else { + TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + + str(prob_n) + ", " + str(prob_k) + "]" + + ", has_act_order = " + str(has_act_order) + + ", num_groups = " + str(num_groups) + + ", group_size = " + str(group_size) + + ", thread_m_blocks = " + str(thread_m_blocks) + + ", thread_n_blocks = " + str(thread_n_blocks) + + ", thread_k_blocks = " + str(thread_k_blocks)); + // } + + // sorted_ids_ptr += 16 * thread_m_blocks * max_par; + // sorted_ids_ptr += 16 * thread_m_blocks * 4; + } + #endif + } + // printf("\n"); +} + +} // namespace marlin_moe + +torch::Tensor marlin_gemm_moe( + const torch::Tensor& a, const torch::Tensor& b_q_weights, + const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights, + const torch::Tensor& topk_ids, + const torch::Tensor& b_scales, const torch::Tensor& g_idx, + const torch::Tensor& perm, const torch::Tensor& expert_offsets, + torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, + bool is_k_full, int64_t num_experts, + int64_t topk, int64_t moe_block_size, bool replicate_input, + bool apply_weights) { + int max_par = 4; + + int dev = a.get_device(); + + auto options_dtype = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + auto options_int = torch::TensorOptions().dtype(torch::kInt).device(a.device()); + torch::Tensor c = torch::zeros({size_m, topk, size_n}, options_dtype); + torch::Tensor a_tmp = replicate_input + ? 
torch::zeros({size_m, size_k}, options_dtype) + : torch::zeros({size_m, topk, size_k}, options_dtype); + #if CPU_OFFSETS + torch::Tensor expert_offsets2 = torch::empty({0}, options_dtype); + #else + torch::Tensor expert_offsets2 + = torch::empty({num_experts + 1}, options_int); + // torch::Tensor expert_offsets2 = torch::arange(0, + // num_experts * moe_block_size, moe_block_size, + // torch::TensorOptions().dtype(torch::kInt).device(a.device())); + // torch::Tensor expert_offsets2 = expert_offsets; + #endif + torch::Tensor barrier_ctrs = torch::zeros({3}, options_int); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel (can usually be left as auto -1) + int sms = -1; + + // Detect groupsize and act_order + int num_groups = -1; + int group_size = -1; + bool has_act_order = g_idx.size(1) != 0; + + int b_rank = b_scales.sizes().size(); + TORCH_CHECK(b_rank == 3, "b_scales rank = ", b_rank, " is not 3"); + TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2), + " is not size_n = ", size_n); + num_groups = b_scales.size(1); + + if (has_act_order) { + if (is_k_full) { + TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1"); + TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by num_groups = ", num_groups); + group_size = size_k / num_groups; + } else { + group_size = 0; + } + + } else { + if (num_groups > 1) { + TORCH_CHECK( + size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by b_scales.size(0) = ", b_scales.size(0)); + group_size = size_k / num_groups; + } else { + group_size = -1; + } + } + + // std::stringstream sstream; + // sstream << topk_ids.dtype().name(); + // std::string s = sstream.str(); + // printf("topk dtype: %s\n", s.c_str()); + + // printf("run with %ld, %ld, %ld\n", size_m, size_n, size_k); + + marlin_moe::marlin_mm_moe_f16i4( + a.data_ptr(), b_q_weights.data_ptr(), c.data_ptr(), sorted_ids.data_ptr(), + topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), expert_offsets.data_ptr(), expert_offsets2.data_ptr(), size_m, + size_n, size_k, workspace.data_ptr(), has_act_order, is_k_full, + num_groups, group_size, num_experts, topk, + moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, + thread_n, sms, max_par, replicate_input, apply_weights, barrier_ctrs.data_ptr()); + return c; +} diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h new file mode 100644 index 0000000000000..a24ca32a52be7 --- /dev/null +++ b/csrc/moe/marlin_moe_ops.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +torch::Tensor marlin_gemm_moe( + const torch::Tensor& a, const torch::Tensor& b_q_weights, + const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights, + const torch::Tensor& topk_ids, + const torch::Tensor& b_scales, const torch::Tensor& g_idx, + const torch::Tensor& perm, const torch::Tensor& expert_offsets, + torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, + bool is_k_full, int64_t num_experts, + int64_t topk, int64_t moe_block_size, bool replicate_input, + bool apply_weights); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 243752b9a9e8c..ca1b5c3341ef1 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -1,12 
+1,25 @@ #include "registration.h" #include "moe_ops.h" +#include "marlin_moe_ops.h" -TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { +#include + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Apply topk softmax to the gating outputs. - m.def( + ops.def( "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! " "token_expert_indices, Tensor gating_output) -> ()"); - m.impl("topk_softmax", torch::kCUDA, &topk_softmax); + ops.impl("topk_softmax", torch::kCUDA, &topk_softmax); + + ops.def( + "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " + "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " + "g_idx, Tensor! perm, " + "Tensor! expert_offsets, Tensor! workspace, int size_m, int size_n, int " + "size_k, bool is_k_full, int num_experts, " + "int topk, int moe_block_size, bool replicate_input, bool apply_weights) " + "-> Tensor"); + ops.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); } REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 2f9eee420f270..e73e5a518ef1a 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -2,13 +2,18 @@ Run `pytest tests/kernels/test_moe.py`. """ +from typing import List + import pytest import torch from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe import (fused_marlin_moe, fused_moe, + single_marlin_moe) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE @@ -29,6 +34,20 @@ def torch_moe(a, w1, w2, score, topk): topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) +def torch_moe_single(a, w, score, topk): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + _, topk_ids = torch.topk(score, topk) + topk_ids = topk_ids.view(-1) + for i in range(w.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = a[mask] @ w[i].transpose(0, 1) + return (out.view(B, -1, w.shape[1])).sum(dim=1) + + @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) @pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 511, 1024]) @@ -99,3 +118,194 @@ def test_mixtral_moe(dtype: torch.dtype): vllm_states, rtol=mixtral_moe_tol[dtype], atol=mixtral_moe_tol[dtype]) + + +def stack_and_dev(tensors: List[torch.Tensor]): + dev = tensors[0].device + return torch.stack(tensors, dim=0).to(dev) + + +def compute_max_diff(output, output_ref): + return torch.mean(torch.abs(output - output_ref)) / torch.mean( + torch.abs(output_ref)) + + +# TODO: make sure this test works +# @pytest.mark.skip("C compiler not installed in NM automation. " +# "This codepath follows a triton pathway, which " +# "JITs using clang or gcc. 
Since neither are installed " +# "in our test instances, we need to skip this for now.") +@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) +@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) +@pytest.mark.parametrize("k", [128, 1024, 512]) +@pytest.mark.parametrize("e", [4, 8, 64]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("act_order", [True, False]) +def test_fused_marlin_moe( + m: int, + n: int, + k: int, + e: int, + topk: int, + group_size: int, + act_order: bool, +): + torch.manual_seed(7) + + if topk > e: + return + + # Filter act_order + if act_order: + if group_size == -1: + return + if group_size in (k, n): + return + + num_bits = 4 + dtype = torch.float16 + a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 + for i in range(w2.shape[0]): + w2[0] = torch.eye(k, n, device='cuda', dtype=dtype) + + w_ref1_l = [] + qweight1_l = [] + scales1_l = [] + g_idx1_l = [] + sort_indices1_l = [] + + for i in range(w1.shape[0]): + test_perm = torch.randperm(k) + w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize( + w1[i].transpose(1, 0), num_bits, group_size, act_order, test_perm) + w_ref1_l.append(w_ref1) + qweight1_l.append(qweight1) + scales1_l.append(scales1) + g_idx1_l.append(g_idx1) + sort_indices1_l.append(sort_indices1) + + w_ref1 = stack_and_dev(w_ref1_l) + qweight1 = stack_and_dev(qweight1_l).contiguous() + scales1 = stack_and_dev(scales1_l) + g_idx1 = stack_and_dev(g_idx1_l) + sort_indices1 = stack_and_dev(sort_indices1_l) + + w_ref2_l = [] + qweight2_l = [] + scales2_l = [] + g_idx2_l = [] + sort_indices2_l = [] + + for i in range(w2.shape[0]): + test_perm = torch.randperm(n) + w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize( + w2[i].transpose(1, 0), num_bits, group_size, act_order, test_perm) + w_ref2_l.append(w_ref2) + qweight2_l.append(qweight2) + scales2_l.append(scales2) + g_idx2_l.append(g_idx2) + sort_indices2_l.append(sort_indices2) + + w_ref2 = stack_and_dev(w_ref2_l) + qweight2 = stack_and_dev(qweight2_l).contiguous() + scales2 = stack_and_dev(scales2_l) + g_idx2 = stack_and_dev(g_idx2_l) + sort_indices2 = stack_and_dev(sort_indices2_l) + + score = torch.randn((m, e), device='cuda', dtype=dtype) + triton_output = fused_moe(a, + w_ref1.transpose(1, 2).contiguous(), + w_ref2.transpose(1, 2).contiguous(), + score, + topk, + renormalize=False) + marlin_output = fused_marlin_moe(a, + qweight1, + qweight2, + score, + g_idx1, + g_idx2, + sort_indices1, + sort_indices2, + topk, + renormalize=False, + w1_scale=scales1, + w2_scale=scales2) + + assert (compute_max_diff(marlin_output, triton_output) < 4e-2) + + +# TODO: make sure this test works +# UPSTREAM SYNC: breaks NM automation. +# @pytest.mark.skip("C compiler not installed in NM automation. " +# "This codepath follows a triton pathway, which " +# "JITs using clang or gcc. 
Since neither are installed " +# "in our test instances, we need to skip this for now.") +@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) +@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) +@pytest.mark.parametrize("k", [128, 1024, 512]) +@pytest.mark.parametrize("e", [4, 8, 64]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("act_order", [True, False]) +def test_single_marlin_moe( + m: int, + n: int, + k: int, + e: int, + topk: int, + group_size: int, + act_order: bool, +): + if topk > e: + return + + # Filter act_order + if act_order: + if group_size == -1: + return + if group_size == k: + return + + num_bits = 4 + dtype = torch.float16 + a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 + w = torch.randn((e, n, k), device='cuda', dtype=dtype) / 10 + + w_ref_l = [] + qweights_l = [] + scales_l = [] + g_idx_l = [] + sort_indices_l = [] + + for i in range(w.shape[0]): + test_perm = torch.randperm(k) + w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize( + w[i].transpose(1, 0), num_bits, group_size, act_order, test_perm) + w_ref_l.append(w_ref) + qweights_l.append(qweight) + scales_l.append(scales) + g_idx_l.append(g_idx) + sort_indices_l.append(sort_indices) + + w_ref = stack_and_dev(w_ref_l) + qweight = stack_and_dev(qweights_l).contiguous() + scales = stack_and_dev(scales_l) + g_idx = stack_and_dev(g_idx_l) + sort_indices = stack_and_dev(sort_indices_l) + + score = torch.randn((m, e), device='cuda', dtype=dtype) + marlin_output = single_marlin_moe(a, + qweight, + scales, + score, + g_idx, + sort_indices, + topk, + renormalize=False) + torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) + + assert (compute_max_diff(marlin_output, torch_output) < 1e-2) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 6cd77f75cae8d..048ab9195d24e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -279,6 +279,19 @@ def awq_marlin_repack(b_q_weight: torch.Tensor, size_k: int, size_n: int, return torch.ops._C.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits) +def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, + size_k: int, size_n: int, + num_bits: int) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + output = torch.empty((num_experts, size_k // 16, size_n * 2), + device=b_q_weight.device, + dtype=b_q_weight.dtype) + for e in range(num_experts): + output[e] = torch.ops._C.gptq_marlin_repack(b_q_weight[e], perm[e], + size_k, size_n, num_bits) + return output + + def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_scales: torch.Tensor, b_zeros: torch.Tensor, g_idx: torch.Tensor, perm: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 3e0767c7d2665..080ecb5cfe0ba 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,3 +1,5 @@ +from vllm.model_executor.layers.fused_moe.fused_moe import (fused_marlin_moe, + single_marlin_moe) from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEMethodBase) from vllm.triton_utils import HAS_TRITON @@ -5,6 +7,8 @@ __all__ = [ "FusedMoE", "FusedMoEMethodBase", + "fused_marlin_moe", + "single_marlin_moe", ] if HAS_TRITON: diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 413c0b6d0924e..47400f06e02e0 100644 --- 
a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -315,6 +315,7 @@ def get_default_config( K: int, topk: int, dtype: Optional[str], + is_marlin: bool, ) -> Dict[str, int]: config = { 'BLOCK_SIZE_M': 64, @@ -322,7 +323,8 @@ def get_default_config( 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8 } - if M <= E: + # A heuristic: fused marlin works faster with this config for small M + if M <= E or (is_marlin and M <= 32): config = { 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, @@ -339,6 +341,7 @@ def try_get_optimal_moe_config( dtype: Optional[str], M: int, override_config: Optional[Dict[str, Any]] = None, + is_marlin: bool = False, ): if override_config: config = override_config @@ -353,7 +356,8 @@ def try_get_optimal_moe_config( config = configs[min(configs.keys(), key=lambda x: abs(x - M))] else: # Else use the default config - config = get_default_config(M, E, N, w1_shape[2], top_k, dtype) + config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, + is_marlin) return config @@ -622,3 +626,206 @@ def fused_moe( w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale) + + +def get_expert_offsets(sorted_token_ids: torch.Tensor, topk_ids: torch.Tensor, + num_experts: int, block_size_m: int): + expert_offsets = [0] * (num_experts + 1) + occurrences = torch.bincount(topk_ids.flatten()).to(dtype=torch.int) + erange = min(num_experts, len(occurrences)) + for i in range(erange): + ex_blocks = (occurrences[i].item() + block_size_m - 1) // block_size_m + expert_offsets[i + 1] = ex_blocks * block_size_m + expert_offsets[i] + for i in range(len(occurrences), num_experts): + expert_offsets[i + 1] = sorted_token_ids.size()[0] + return torch.as_tensor(expert_offsets) + + +def single_marlin_moe( + hidden_states: torch.Tensor, + w: torch.Tensor, + scales: torch.Tensor, + gating_output: torch.Tensor, + g_idx: torch.Tensor, + rand_perm: torch.Tensor, + topk: int, + renormalize: bool, + override_config: Optional[Dict[str, Any]] = None, + use_fp8: bool = False, +) -> torch.Tensor: + """ + This function computes a Marlin MoE MMM using weights w + and top-k gating mechanism. It is meant for testing and debugging. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w (torch.Tensor): The first set of expert weights. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner + products for w and w2. Defaults to False. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. 
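+    # w is Marlin-packed with shape (E, K // 16, 2 * N) (see
+    # gptq_marlin_moe_repack), hence K is recovered as w.shape[1] * 16 and N
+    # as w.shape[2] // 2 below.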
+ assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + assert hidden_states.shape[1] == w.shape[1] * 16, "Hidden size mismatch" + assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w.is_contiguous(), "Expert weights must be contiguous" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] + M, K = hidden_states.shape + E = w.shape[0] + N = w.shape[2] // 2 + + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, + renormalize) + + # This might not be an optimal config for a single MMM + get_config_func = functools.partial(try_get_optimal_moe_config, + w.shape, + w.shape, + topk_ids.shape[1], + "float8" if use_fp8 else None, + override_config=override_config, + is_marlin=True) + config = get_config_func(M) + + block_size_m = config['BLOCK_SIZE_M'] + + sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( + topk_ids, block_size_m, E) + + max_workspace_size = (N // 64) * 16 + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + device="cuda", + requires_grad=False) + + expert_offsets = get_expert_offsets(sorted_token_ids, topk_ids, E, + block_size_m) + + intermediate_cache = torch.ops._moe_C.marlin_gemm_moe( + hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales, + g_idx, rand_perm, expert_offsets, workspace, M, N, K, True, E, topk, + block_size_m, True, False) + + return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) + + +def fused_marlin_moe(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + g_idx1: torch.Tensor, + g_idx2: torch.Tensor, + rand_perm1: torch.Tensor, + rand_perm2: torch.Tensor, + topk: int, + renormalize: bool, + override_config: Optional[Dict[str, Any]] = None, + use_fp8: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner + products for w1 and w2. Defaults to False. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for + w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for + w2. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. 
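+    # w1 and w2 are Marlin-packed, so w1 has shape (E, K // 16, 2 * (2 * N))
+    # and w2 has shape (E, N // 16, 2 * K); the shape checks below rely on
+    # this packing.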
+ assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + assert hidden_states.shape[ + 1] == w1.shape[1] * 16, "Hidden size mismatch w1" + assert hidden_states.shape[ + 1] == w2.shape[2] // 2, "Hidden size mismatch w2" + assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.is_contiguous(), "Expert weights1 must be contiguous" + assert w2.is_contiguous(), "Expert weights2 must be contiguous" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] + M, K = hidden_states.shape + E = w1.shape[0] + N = w2.shape[1] * 16 + + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, + renormalize) + + get_config_func = functools.partial(try_get_optimal_moe_config, + w1.shape, + w2.shape, + topk_ids.shape[1], + "float8" if use_fp8 else None, + override_config=override_config, + is_marlin=True) + config = get_config_func(M) + + block_size_m = config['BLOCK_SIZE_M'] + + sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( + topk_ids, block_size_m, E) + + max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + device="cuda", + requires_grad=False) + + expert_offsets = get_expert_offsets(sorted_token_ids, topk_ids, E, + block_size_m) + # expert_offsets = torch.empty((0)) + + intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N), + device=hidden_states.device, + dtype=hidden_states.dtype) + + intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe( + hidden_states, w1, sorted_token_ids, topk_weights, topk_ids, w1_scale, + g_idx1, rand_perm1, expert_offsets, workspace, M, 2 * N, K, True, E, + topk, block_size_m, True, False) + + ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) + + intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( + intermediate_cache2, w2, sorted_token_ids, topk_weights, topk_ids, + w2_scale, g_idx2, rand_perm2, expert_offsets, workspace, M, K, N, True, + E, topk, block_size_m, False, True) + + # intermediate_cache3 = torch.zeros((M, topk, K), + # device=hidden_states.device, + # dtype=hidden_states.dtype) + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), + dim=1) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a0dc4c94744a8..564a316b4894a 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,15 +1,21 @@ +import enum from abc import abstractmethod +from enum import Enum from typing import List, Optional, Tuple import torch +from vllm import _custom_ops as ops from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.fused_moe.fused_moe import fused_marlin_moe from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) from vllm.model_executor.utils import set_weight_attrs logger = init_logger(__name__) @@ -36,6 +42,260 @@ def apply(self, raise NotImplementedError +class GPTQMarlinState(Enum): + REPACK = enum.auto() + READY = enum.auto() + + +class MarlinFusedMoEMethod(FusedMoEMethodBase): + 
"""MoE Marlin method with quantization.""" + + def __init__(self, quant_config: GPTQMarlinConfig) -> None: + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size: int, + params_dtype: torch.dtype, **extra_weight_attrs): + # Currently assuming is_k_full is always True + # (input size per partition is the same as full input size) + # Supports only sym for now (no zp) + if self.quant_config.group_size != -1: + scales_size13 = hidden_size // self.quant_config.group_size + scales_size2 = intermediate_size // self.quant_config.group_size + else: + scales_size13 = 1 + scales_size2 = 1 + # Fused gate_up_proj (column parallel) + w13_qweight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size // self.quant_config.pack_factor, + 2 * intermediate_size, + dtype=torch.int32), + requires_grad=False) + layer.register_parameter("w13_qweight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter(torch.empty( + num_experts, + intermediate_size // self.quant_config.pack_factor, + hidden_size, + dtype=torch.int32), + requires_grad=False) + layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + # up_proj scales + w13_scales = torch.nn.Parameter(torch.empty(num_experts, + scales_size13, + 2 * intermediate_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + # down_proj scales + w2_scales = torch.nn.Parameter(torch.empty(num_experts, + scales_size2, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + w13_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx", w13_g_idx) + set_weight_attrs(w13_g_idx, extra_weight_attrs) + w2_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx", w2_g_idx) + set_weight_attrs(w2_g_idx, extra_weight_attrs) + w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) + w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) + layer.marlin_state = GPTQMarlinState.REPACK + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None) -> torch.Tensor: + if layer.marlin_state == GPTQMarlinState.REPACK: + layer.marlin_state = GPTQMarlinState.READY + + # Newly generated tensors need to replace existing tensors that are + # already registered as parameters by vLLM (and won't be freed) + def replace_tensor(name, new_t): + # It is important to use resize_() here since it ensures + # the same buffer is reused 
+ getattr(layer, name).resize_(new_t.shape) + getattr(layer, name).copy_(new_t) + del new_t + + def get_scale_perms(num_bits: int): + scale_perm: List[int] = [] + for i in range(8): + scale_perm.extend([i + 8 * j for j in range(8)]) + scale_perm_single: List[int] = [] + for i in range(4): + scale_perm_single.extend( + [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) + return scale_perm, scale_perm_single + + def marlin_permute_scales(s: torch.Tensor, size_k: int, + size_n: int, group_size: int, + num_bits: int): + scale_perm, scale_perm_single = get_scale_perms(num_bits) + if group_size < size_k and group_size != -1: + s = s.reshape((-1, len(scale_perm)))[:, scale_perm] + else: + s = s.reshape( + (-1, len(scale_perm_single)))[:, scale_perm_single] + s = s.reshape((-1, size_n)).contiguous() + return s + + def marlin_moe_permute_scales(s: torch.Tensor, size_k: int, + size_n: int, group_size: int, + num_bits: int): + num_experts = s.shape[0] + output = torch.empty((num_experts, s.shape[1], s.shape[2]), + device=s.device, + dtype=s.dtype) + for e in range(num_experts): + output[e] = marlin_permute_scales(s[e], size_k, size_n, + group_size, num_bits) + return output + + # Process act_order + if self.quant_config.desc_act: + # Get sorting based on g_idx + num_experts = layer.w13_g_idx.shape[0] + w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) + w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_g_idx[e]).to(torch.int32) + w2_g_idx_sort_indices[e] = torch.argsort( + layer.w2_g_idx[e]).to(torch.int32) + w13_sorted_g_idx[e] = layer.w13_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_g_idx[e][ + w2_g_idx_sort_indices[e]] + replace_tensor("w13_g_idx", w13_sorted_g_idx) + replace_tensor("w2_g_idx", w2_sorted_g_idx) + replace_tensor("w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_tensor("w2_g_idx_sort_indices", w2_g_idx_sort_indices) + else: + # Reset g_idx related tensors + num_experts = layer.w13_g_idx.shape[0] + device = layer.w13_g_idx.device + layer.w13_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), + dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), + dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), + dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), + dtype=torch.int32, + device=device), + requires_grad=False, + ) + # Repack weights + marlin_w13_qweight = ops.gptq_marlin_moe_repack( + layer.w13_qweight, + layer.w13_g_idx_sort_indices, + layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + layer.w13_qweight.shape[2], + self.quant_config.weight_bits, + ) + replace_tensor("w13_qweight", marlin_w13_qweight) + marlin_w2_qweight = ops.gptq_marlin_moe_repack( + layer.w2_qweight, + layer.w2_g_idx_sort_indices, + layer.w2_qweight.shape[1] * self.quant_config.pack_factor, + layer.w2_qweight.shape[2], + self.quant_config.weight_bits, + ) + replace_tensor("w2_qweight", marlin_w2_qweight) + # Repack scales + marlin_w13_scales = marlin_moe_permute_scales( + layer.w13_scales, + x.shape[1], + layer.w13_scales.shape[2], + self.quant_config.group_size, + 
self.quant_config.weight_bits, + ) + replace_tensor("w13_scales", marlin_w13_scales) + marlin_w2_scales = marlin_moe_permute_scales( + layer.w2_scales, + layer.w2_scales.shape[1] * self.quant_config.pack_factor, + x.shape[1], + self.quant_config.group_size, + self.quant_config.weight_bits, + ) + replace_tensor("w2_scales", marlin_w2_scales) + return fused_marlin_moe(x, + layer.w13_qweight, + layer.w2_qweight, + router_logits, + layer.w13_g_idx, + layer.w2_g_idx, + layer.w13_g_idx_sort_indices, + layer.w2_g_idx_sort_indices, + top_k, + renormalize=renormalize, + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales) + + class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" @@ -178,9 +438,12 @@ def __init__( self.num_expert_group = num_expert_group self.topk_group = topk_group + self.quant_method: Optional[QuantizeMethodBase] = None + if quant_config is None: - self.quant_method: Optional[QuantizeMethodBase] = ( - UnquantizedFusedMoEMethod()) + self.quant_method = UnquantizedFusedMoEMethod() + elif isinstance(quant_config, GPTQMarlinConfig): + self.quant_method = MarlinFusedMoEMethod(quant_config) else: self.quant_method = quant_config.get_quant_method(self, prefix) assert self.quant_method is not None @@ -193,54 +456,82 @@ def __init__( params_dtype=params_dtype, weight_loader=self.weight_loader) - def weight_loader(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor, weight_name: str, - shard_id: int, expert_id: int): + def weight_loader(self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: int, + expert_id: int, + is_quantized: bool = False): param_data = param.data - # Input scales can be loaded directly and should be equal. - if "input_scale" in weight_name: - if param_data[expert_id] != 1 and (param_data[expert_id] - - loaded_weight).abs() > 1e-5: - raise ValueError( - "input_scales of w1 and w3 of a layer " - f"must be equal. But got {param_data[expert_id]} " - f"vs. {loaded_weight}") - param_data[expert_id] = loaded_weight - # Weight scales - elif "weight_scale" in weight_name: - # If we are in merged column case (gate_up_proj) - # shard_id 0 == gate_proj / w1 - # shard_id 2 == up_proj / w3 - if shard_id == 0 or shard_id == 2: - # We have to keep the weight scales of w1 and w3 because - # we need to re-quantize w1/w3 weights after weight loading. 
- idx = 0 if shard_id == 0 else 1 - param_data[expert_id][idx] = loaded_weight - # If we are in the row parallel case (down_proj) - # shard_id 1 == down_proj / w2 - else: + if is_quantized: + if "_qweight" in weight_name or "_scales" in weight_name: + if "w13" in weight_name: + shard_size = self.intermediate_size_per_partition + if shard_id == 0: + param_data[expert_id, :, :shard_size] = loaded_weight + elif shard_id == 1: + param_data[expert_id, :, shard_size:] = loaded_weight + else: + raise ValueError(f"Invalid shard_id: {shard_id}: " + "must be 0 or 1.") + elif "w2" in weight_name: + param_data[expert_id][:] = loaded_weight + else: + raise ValueError(f"Invalid weight name: {weight_name}: " + "must contain 'w13' or 'w2'.") + elif "_g_idx" in weight_name: + if "w13" not in weight_name and "w2" not in weight_name: + raise ValueError(f"Invalid weight name: {weight_name}: " + "must contain 'w13' or 'w2'.") param_data[expert_id] = loaded_weight - # Weights + else: + raise ValueError(f"Invalid weight name: {weight_name}.") else: - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.intermediate_size_per_partition - shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) - - # w1, gate_proj case: Load into first shard of w13. - if shard_id == 0: - param_data[expert_id, - 0:shard_size, :] = loaded_weight[shard, :] - # w3, up_proj case: Load into second shard of w13. - elif shard_id == 2: - param_data[expert_id, shard_size:2 * - shard_size, :] = loaded_weight[shard, :] - # w2, down_proj case: Load into only shard of w2. - elif shard_id == 1: - param_data[expert_id, :, :] = loaded_weight[:, shard] + # Input scales can be loaded directly and should be equal. + if "input_scale" in weight_name: + if param_data[expert_id] != 1 and (param_data[expert_id] - + loaded_weight).abs() > 1e-5: + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param_data[expert_id]} " + f"vs. {loaded_weight}") + param_data[expert_id] = loaded_weight + # Weight scales + elif "weight_scale" in weight_name: + # If we are in merged column case (gate_up_proj) + # shard_id 0 == gate_proj / w1 + # shard_id 2 == up_proj / w3 + if shard_id == 0 or shard_id == 2: + # We have to keep the weight scales of w1 and w3 because + # we need to re-quantize w1/w3 weights after weight loading. + idx = 0 if shard_id == 0 else 1 + param_data[expert_id][idx] = loaded_weight + # If we are in the row parallel case (down_proj) + # shard_id 1 == down_proj / w2 + else: + param_data[expert_id] = loaded_weight + # Weights else: - raise ValueError( - f"Shard id must be in [0,1,2] but got {shard_id}") + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.intermediate_size_per_partition + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + + # w1, gate_proj case: Load into first shard of w13. + if shard_id == 0: + param_data[expert_id, + 0:shard_size, :] = loaded_weight[shard, :] + # w3, up_proj case: Load into second shard of w13. + elif shard_id == 2: + param_data[expert_id, shard_size:2 * + shard_size, :] = loaded_weight[shard, :] + # w2, down_proj case: Load into only shard of w2. 
+ elif shard_id == 1: + param_data[expert_id, :, :] = loaded_weight[:, shard] + else: + raise ValueError( + f"Shard id must be in [0,1,2] but got {shard_id}") def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index 541d148c761fc..9161b2febbd17 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -1,6 +1,6 @@ """Utility functions used for tests and benchmarks""" -from typing import List +from typing import List, Optional import numpy as np import torch @@ -90,8 +90,13 @@ def get_weight_perm(num_bits: int): return perm -def marlin_quantize(w: torch.Tensor, num_bits: int, group_size: int, - act_order: bool): +def marlin_quantize( + w: torch.Tensor, + num_bits: int, + group_size: int, + act_order: bool, + test_perm: Optional[torch.Tensor] = None, +): size_k, size_n = w.shape # Normalize group_size @@ -101,7 +106,7 @@ def marlin_quantize(w: torch.Tensor, num_bits: int, group_size: int, # Quantize (and apply act_order if provided) w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size, - act_order) + act_order, test_perm) # For act_order, sort the "weights" and "g_idx" so that group ids are # increasing diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 7ade8bf664ccc..5bb38f81eb963 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,5 +1,5 @@ """This file is used for /tests and /benchmarks""" -from typing import List +from typing import List, Optional import numpy import torch @@ -49,7 +49,10 @@ def get_pack_factor(num_bits): return 32 // num_bits -def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int): +def permute_rows(q_w: torch.Tensor, + w_ref: torch.Tensor, + group_size: int, + test_perm: Optional[torch.Tensor] = None): assert q_w.shape == w_ref.shape orig_device = q_w.device @@ -60,7 +63,7 @@ def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int): g_idx[i] = i // group_size # Simulate act_order by doing a random permutation on K - rand_perm = torch.randperm(k_size) + rand_perm = test_perm if test_perm is not None else torch.randperm(k_size) g_idx = g_idx[rand_perm].contiguous() q_w = q_w[rand_perm, :].contiguous() @@ -74,8 +77,11 @@ def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int): ) -def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int, - act_order: bool): +def quantize_weights(w: torch.Tensor, + num_bits: int, + group_size: int, + act_order: bool, + test_perm: Optional[torch.Tensor] = None): orig_device = w.device size_k, size_n = w.shape @@ -133,7 +139,8 @@ def reshape_w(w): ), "For act_order, groupsize = {} must be less than size_k = {}".format( group_size, size_k) - w_ref, q_w, g_idx, rand_perm = permute_rows(q_w, w_ref, group_size) + w_ref, q_w, g_idx, rand_perm = permute_rows(q_w, w_ref, group_size, + test_perm) return ( w_ref.to(device=orig_device), diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 10faa5cc6b6cc..86e6e3c2b299f 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -21,6 +21,7 @@ # See the License 
for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" +import re from typing import Iterable, List, Optional, Tuple import numpy as np @@ -34,6 +35,7 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, @@ -94,10 +96,13 @@ class MixtralMoE(nn.Module): def __init__( self, config: MixtralConfig, + use_fused_moe: bool, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.config = config + self.use_fused_moe = use_fused_moe + self.quant_config = quant_config self.rank = get_tensor_model_parallel_rank() self.tp_size = get_tensor_model_parallel_world_size() self.num_total_experts = config.num_local_experts @@ -113,14 +118,27 @@ def __init__( raise ValueError( f"Rank {self.rank} has no experts assigned to it.") - self.experts = nn.ModuleList([ - MixtralMLP(self.num_total_experts, - config.hidden_size, - config.intermediate_size, - quant_config=quant_config) - if idx in self.expert_indicies else None - for idx in range(self.num_total_experts) - ]) + if self.use_fused_moe: + params_dtype = torch.float16 + self.experts = FusedMoE(num_experts=self.num_total_experts, + top_k=self.top_k, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=self.tp_size) + else: + self.experts = nn.ModuleList([ + MixtralMLP(self.num_total_experts, + config.hidden_size, + config.intermediate_size, + quant_config=quant_config) + if idx in self.expert_indicies else None + for idx in range(self.num_total_experts) + ]) + self.gate = ReplicatedLinear(config.hidden_size, self.num_total_experts, bias=False, @@ -129,31 +147,36 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = None - for expert_idx in self.expert_indicies: - expert_layer = self.experts[expert_idx] - expert_mask = (selected_experts == expert_idx) - expert_weights = (routing_weights * expert_mask).sum(dim=-1, - keepdim=True) - - current_hidden_states = expert_layer(hidden_states).mul_( - expert_weights) - if final_hidden_states is None: - final_hidden_states = current_hidden_states - else: - final_hidden_states.add_(current_hidden_states) - - return tensor_model_parallel_all_reduce(final_hidden_states).view( - num_tokens, hidden_dim) + if self.use_fused_moe: + ret = self.experts(hidden_states.half(), router_logits) + return ret.bfloat16() + else: + routing_weights = F.softmax(router_logits, + dim=1, + dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in self.expert_indicies: + expert_layer = self.experts[expert_idx] + expert_mask = 
(selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum( + dim=-1, keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + num_tokens, hidden_dim) class MixtralAttention(nn.Module): @@ -238,6 +261,7 @@ class MixtralDecoderLayer(nn.Module): def __init__( self, config: MixtralConfig, + use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -254,6 +278,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config) self.block_sparse_moe = MixtralMoE(config=config, + use_fused_moe=use_fused_moe, quant_config=quant_config) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -294,6 +319,7 @@ class MixtralModel(nn.Module): def __init__( self, config: MixtralConfig, + use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -307,6 +333,7 @@ def __init__( ) self.layers = nn.ModuleList([ MixtralDecoderLayer(config, + use_fused_moe, cache_config, quant_config=quant_config) for _ in range(config.num_hidden_layers) @@ -341,9 +368,21 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() + + # print(config) + # print(cache_config) + # print(quant_config) + + # FP8 hasn't been tested. Works only with enforce-eager + self.use_fused_moe = True + #(config.torch_dtype != torch.float8_e4m3fn and + #config.torch_dtype != torch.float16) + # print("use fused?", config.torch_dtype) + self.config = config self.quant_config = quant_config - self.model = MixtralModel(config, cache_config, quant_config) + self.model = MixtralModel(config, self.use_fused_moe, cache_config, + quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -403,11 +442,51 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Skip experts that are not assigned to this worker. - if ("block_sparse_moe.experts." in name - and name not in params_dict): - continue + + if self.use_fused_moe: + if ("block_sparse_moe.experts." in name + and ".w1." not in name and ".w2." not in name + and ".w3." not in name + and name not in params_dict): + continue + + if (".qzeros" in name): + continue + + shard_id = None + expert_id = 0 + + has_any_numbered = (".qweight" in name or ".scales" in name + or ".g_idx" in name) + if (has_any_numbered and (".w1." in name)): + name = name.replace(".w1.", ".w13_") + shard_id = 0 + if (has_any_numbered and (".w2." in name)): + name = name.replace(".w2.", ".w2_") + shard_id = 0 + if (has_any_numbered and (".w3." in name)): + name = name.replace(".w3.", ".w13_") + shard_id = 1 + + exp_string = re.search(r"\.experts\.\d+.", name) + if exp_string: + exp_string = exp_string.group(0) + expert_id = int(exp_string.split(".")[2]) + name = name.replace(exp_string, ".experts.") + + else: + if ("block_sparse_moe.experts." 
in name + and name not in params_dict): + continue + param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + + if self.use_fused_moe and shard_id is not None: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, name, shard_id, + expert_id, True) + else: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From b39dba49ded86de3076755e27c9d9434d8fec0e5 Mon Sep 17 00:00:00 2001 From: Eliza Wszola Date: Fri, 2 Aug 2024 12:45:44 +0000 Subject: [PATCH 002/106] clean up the CPU code --- csrc/moe/marlin_moe_ops.cu | 1527 ++--------------- csrc/moe/marlin_moe_ops.h | 13 +- csrc/moe/torch_bindings.cpp | 9 +- .../layers/fused_moe/fused_moe.py | 34 +- vllm/model_executor/models/mixtral_quant.py | 11 +- 5 files changed, 196 insertions(+), 1398 deletions(-) diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu index ebc1693b2ba50..92184f43c9eb0 100644 --- a/csrc/moe/marlin_moe_ops.cu +++ b/csrc/moe/marlin_moe_ops.cu @@ -30,8 +30,6 @@ inline std::string str(T x) { return std::to_string(x); } -#define CPU_OFFSETS true - namespace marlin_moe { constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } @@ -232,1031 +230,72 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, if (finish_row > size_m) { finish_row = size_m; } - int cur_block_rows = finish_row - start_row; - - int row_stride = size_k * sizeof(half) / 16; - - auto permute_row = [&](int row) { - int iters = size_k / blockDim.x; - int rest = size_k % blockDim.x; - - int offset = row * row_stride; - - half const* a_row_half = reinterpret_cast(a_int4_ptr + offset); - half* out_half = reinterpret_cast(out_int4_ptr + offset); - - int base_k = 0; - - for (int i = 0; i < iters; i++) { - int cur_k = base_k + threadIdx.x; - int src_pos = perm_int_ptr[cur_k]; - - out_half[cur_k] = a_row_half[src_pos]; - - base_k += blockDim.x; - } - - if (rest) { - if (threadIdx.x < rest) { - int cur_k = base_k + threadIdx.x; - int src_pos = perm_int_ptr[cur_k]; - - out_half[cur_k] = a_row_half[src_pos]; - } - } - }; - - for (int i = 0; i < cur_block_rows; i++) { - int cur_row = start_row + i; - if (cur_row < size_m) { - permute_row(cur_row); - } - } -} - -__global__ void compute_expert_offsets(int const* __restrict__ topk_ids, - int* __restrict__ expert_offsets, - int topk_length, - int block_size) { - int expert_id = threadIdx.x; - int num_experts = blockDim.x; - - int occurrences = 0; - for (int i = 0; i < topk_length; ++i) { - occurrences += (topk_ids[i] == expert_id); - } - expert_offsets[expert_id + 1] = occurrences; - __syncthreads(); - - if (threadIdx.x == 0) { - int tot_offset = 0; - expert_offsets[0] = 0; - for (int i = 0; i < num_experts; ++i) { - tot_offset += ceildiv(expert_offsets[i + 1], block_size) * block_size; - expert_offsets[i + 1] = tot_offset; - } - // for (int i = 0; i < num_experts + 1; ++i) { - // printf("expert offset: %d -> %d (%d %d)\n", - // i, expert_offsets[i], topk_length, block_size); - // } - } - __syncthreads(); - -} - -#if CPU_OFFSETS - -template shared - // fetch pipeline - const bool has_act_order, // whether act_order is enabled - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void MarlinMoE( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix 
of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int* __restrict__ sorted_ids, // int32 sorted ids of experts - const float* __restrict__ topk_weights, // float topk weights - const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape - // (k/groupsize)xn - const int* __restrict__ g_idx, // int32 group indices of shape k - const int* __restrict__ expert_offsets, - int num_groups, // number of scale groups per output channel - int expert_idx, // idx of current expert // TODO must decide based on offsets - int num_experts, // number of experts - int topk, // topk parameter of moe - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int tot_m, // total number of rows in A and C - int* locks, // extra global storage for barrier synchronization - bool replicate_input, // do we use the same input for each expert? - bool apply_weights, // apply weights to output - int try_m_block_ctr, // experiment - int* barrier_ctrs -) { - - // int tot_m_blocks = ceildiv(tot_m, 16); - // if (try_m_block_ctr >= tot_m_blocks) { - // return; - // } - - // Each threadblock processes one "stripe" of the B matrix with (roughly) the - // same size, which might involve multiple column "slices" (of width 16 * - // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. - - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a - // better partitioning with less reductions - int parallel = 1; - if (prob_m > 16 * thread_m_blocks) { - parallel = prob_m / (16 * thread_m_blocks); - prob_m = 16 * thread_m_blocks; - } - - int k_tiles = prob_k / 16 / thread_k_blocks; - int n_tiles = prob_n / 16 / thread_n_blocks; - int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); - - if constexpr (!has_act_order && group_blocks != -1) { - if (group_blocks >= thread_k_blocks) { - // Ensure that the number of tiles in each stripe is a multiple of the - // groupsize; this avoids an annoying special case where a stripe starts - // in the middle of group. - iters = (group_blocks / thread_k_blocks) * - ceildiv(iters, (group_blocks / thread_k_blocks)); - } - } - - int slice_row = (iters * blockIdx.x) % k_tiles; - int slice_col_par = (iters * blockIdx.x) / k_tiles; - int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice - int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top - - // We can easily implement parallel problem execution by just remapping - // indices and advancing global pointers - if (slice_col_par >= n_tiles) { - locks += (slice_col_par / n_tiles) * n_tiles; - slice_col = slice_col_par % n_tiles; - sorted_ids += (slice_col_par / n_tiles) * 16 * thread_m_blocks; - } - - // Compute all information about the current slice which is required for - // synchronization. 
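The striped partitioning explained in the comment block above can be reproduced on the host, which helps when reasoning about occupancy. The sketch below mirrors the slice index arithmetic with illustrative tile sizes and SM count (all numbers are assumptions, not values taken from a real launch, and the group-size alignment tweak applied to iters is omitted):

    def ceildiv(a: int, b: int) -> int:
        return (a + b - 1) // b


    def stripe_starts(prob_m, prob_n, prob_k, thread_m_blocks,
                      thread_n_blocks, thread_k_blocks, num_blocks):
        # Mirrors the slice bookkeeping at the top of the kernel.
        parallel = 1
        if prob_m > 16 * thread_m_blocks:
            parallel = prob_m // (16 * thread_m_blocks)
        k_tiles = prob_k // (16 * thread_k_blocks)
        n_tiles = prob_n // (16 * thread_n_blocks)
        iters = ceildiv(k_tiles * n_tiles * parallel, num_blocks)
        starts = []
        for block_idx in range(num_blocks):
            slice_row = (iters * block_idx) % k_tiles
            slice_col_par = (iters * block_idx) // k_tiles
            starts.append((slice_row, slice_col_par % n_tiles))
        return starts


    # Example: a 256x4096x4096 problem on 108 SMs with 64x256x64 tiles.
    print(stripe_starts(256, 4096, 4096, 4, 16, 4, 108))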
- auto init_slice = [&]() { - slice_iters = - iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; - if (slice_iters == 0) return; - if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; - slice_count = 1; - slice_idx = 0; - int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); - if (col_first <= k_tiles * (slice_col_par + 1)) { - int col_off = col_first - k_tiles * slice_col_par; - slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) slice_count++; - int delta_first = iters * blockIdx.x - col_first; - if (delta_first < 0 || (col_off == 0 && delta_first == 0)) - slice_idx = slice_count - 1; - else { - slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) slice_idx--; - } - } - if (slice_col == n_tiles) { - sorted_ids += 16 * thread_m_blocks; - locks += n_tiles; - slice_col = 0; - } - }; - init_slice(); - - // A sizes/strides - - // stride of the A matrix in global memory - int a_gl_stride = prob_k / 8; - // stride of an A matrix tile in shared memory - constexpr int a_sh_stride = 16 * thread_k_blocks / 8; - // delta between subsequent A tiles in global memory - constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; - // between subsequent accesses within a tile - int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); - // between shared memory writes - constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); - // between shared memory tile reads - constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); - // within a shared memory tile - constexpr int a_sh_rd_delta_i = a_sh_stride * 16; - // overall size of a tile - constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks); - // number of shared write iterations for a tile - constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta); - - // B sizes/strides - int b_gl_stride = 16 * prob_n / 32; - constexpr int b_sh_stride = 32 * thread_n_blocks / 4; - int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; - int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); - constexpr int b_sh_wr_delta = threads; - constexpr int b_sh_rd_delta = threads; - constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; - constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - - // Scale sizes/strides without act_order - int s_gl_stride = prob_n / 8; - constexpr int s_sh_stride = 16 * thread_n_blocks / 8; - constexpr int s_tb_groups = !has_act_order && group_blocks < thread_k_blocks - ? thread_k_blocks / group_blocks - : 1; - constexpr int s_sh_stage = s_tb_groups * s_sh_stride; - int s_gl_rd_delta = s_gl_stride; - // Scale size/strides with act_order - constexpr int tb_k = 16 * thread_k_blocks; - constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0; - // constexpr int act_s_row_stride = 1; - // int act_s_col_stride = act_s_row_stride * num_groups; - int act_s_col_stride = 1; - int act_s_col_warp_stride = act_s_col_stride * 8; - int tb_n_warps = thread_n_blocks / 4; - int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; - - constexpr int sorted_sh_stride = threads; - constexpr int sorted_gl_stride = threads; - - // Global A read index of current thread. - int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - a_gl_rd += a_gl_rd_delta_o * slice_row; - // Shared write index of current thread. 
- int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - // Shared read index. - int a_sh_rd = - a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; - a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); - - int b_gl_rd = - b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); - b_gl_rd += b_sh_stride * slice_col; - b_gl_rd += b_gl_rd_delta_o * slice_row; - int b_sh_wr = threadIdx.x; - int b_sh_rd = threadIdx.x; - - // For act_order - constexpr int k_iter_size = tb_k / b_sh_wr_iters; - int slice_k_start = tb_k * slice_row; - int slice_k_finish = slice_k_start + tb_k * slice_iters; - int slice_k_start_shared_fetch = slice_k_start; - int slice_n_offset = act_s_col_tb_stride * slice_col; - - // No act_order - int s_gl_rd; - if constexpr (group_blocks == -1 || group_blocks == 0) { - s_gl_rd = s_sh_stride * slice_col + threadIdx.x; - } else { - s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + - s_sh_stride * slice_col + threadIdx.x; - } - int s_sh_wr = threadIdx.x; - bool s_sh_wr_pred = threadIdx.x < s_sh_stride; - - // We use a different scale layout for grouped and column-wise quantization as - // we scale a `half2` tile in column-major layout in the former and in - // row-major in the latter case. - int s_sh_rd; - if constexpr (group_blocks != -1) - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - else - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) % 4; - - int sh_first_group_id = -1; - int sh_num_groups = -1; - constexpr int sh_max_num_groups = 32; - - int shs_size; - if constexpr (has_act_order) - shs_size = sh_max_num_groups * s_sh_stride + threads; - else - shs_size = group_blocks > 0 ? stages * s_sh_stage : threads; - - extern __shared__ int4 sh[]; - // Shared memory storage for global fetch pipelines. - int4* sh_a = sh; - int4* sh_b = sh_a + (stages * a_sh_stage); - int4* sh_g_idx = sh_b + (stages * b_sh_stage); - int4* sh_s = sh_g_idx + (stages * g_idx_stage); - int* sh_sorted = (int*)(sh_s + shs_size); - - // Precompute which thread should not read memory in which iterations; this is - // needed if there are more threads than required for a certain tilesize or - // when the batchsize is not a multiple of 16. - bool a_sh_wr_pred[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - int a_idx = a_sh_wr_delta * i + a_sh_wr; - int row = a_idx / a_gl_rd_delta_o; - if (row >= prob_m) { - a_sh_wr_pred[i] = false; - } else { - a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; - } - } - - // To ensure that writing and reading A tiles to/from shared memory, the - // latter in fragment format, is fully bank conflict free, we need to use a - // rather fancy XOR-based layout. The key here is that neither reads nor - // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the - // same shared memory banks. Further, it seems (based on NSight-Compute) that - // each warp must also write a consecutive memory segment? - auto transform_a = [&](int i) { - int row = i / a_gl_rd_delta_o; - return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; - }; - // Since the computation of this remapping is non-trivial and, due to our main - // loop unrolls, all shared memory accesses are static, we simply precompute - // both transformed reads and writes. 
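The XOR-based remapping mentioned above is easy to sanity-check in isolation. A sketch assuming an illustrative a_gl_rd_delta_o of 8 (the real value depends on the tile configuration):

    def transform_a(i: int, delta_o: int) -> int:
        # Same arithmetic as the transform_a lambda: the XOR binds more
        # loosely than +, so the whole row offset is XOR-ed with the row id.
        row = i // delta_o
        return (delta_o * row + (i % delta_o)) ^ row


    delta_o = 8  # illustrative: int4 loads per tile row
    for row in range(4):
        offsets = [transform_a(row * delta_o + j, delta_o) for j in range(8)]
        # The 8 accesses of a row stay unique, just reordered within the row.
        assert sorted(offsets) == [row * delta_o + j for j in range(8)]
        print(row, offsets)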
- int a_sh_wr_trans[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); - int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - #pragma unroll - for (int j = 0; j < thread_m_blocks; j++) - a_sh_rd_trans[i][j] = - transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); - } - - // Since B-accesses have non-constant stride they have to be computed at - // runtime; we break dependencies between subsequent accesses with a tile by - // maintining multiple pointers (we have enough registers), a tiny - // optimization. - const int4* B_ptr[b_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; - - // Register storage for double buffer of shared memory reads. - FragA frag_a[2][thread_m_blocks]; - I4 frag_b_quant[2]; - FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; // No act-order - FragS act_frag_s[2][4][4]; // For act-order - - // Zero accumulators. - auto zero_accums = [&]() { - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; - }; - - auto fetch_scales_to_shared = [&](bool is_async, int first_group_id, - int last_group_id) { - sh_first_group_id = first_group_id; - sh_num_groups = last_group_id - first_group_id + 1; - - if (sh_num_groups < sh_max_num_groups) { - sh_num_groups = sh_max_num_groups; - } - - if (sh_first_group_id + sh_num_groups > num_groups) { - sh_num_groups = num_groups - sh_first_group_id; - } - - int row_offset = first_group_id * s_gl_stride; - - if (is_async) { - for (int i = 0; i < sh_num_groups; i++) { - if (threadIdx.x < s_sh_stride) { - cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], - &scales_ptr[row_offset + (i * s_gl_stride) + - slice_n_offset + threadIdx.x]); - } - } - } else { - for (int i = 0; i < sh_num_groups; i++) { - if (threadIdx.x < s_sh_stride) { - sh_s[(i * s_sh_stride) + threadIdx.x] = - scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + - threadIdx.x]; - } - } - } - }; - // Asynchronously fetch the next A, B and s tile from global to the next - // shared memory pipeline location. - auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { - if (pred) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - int a_idx = a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off; - int row = a_idx / a_gl_stride; - int sorted_row = - replicate_input ? sorted_ids[row] / topk : sorted_ids[row]; - int new_idx = sorted_row * a_gl_stride + a_idx % a_gl_stride; - if (sorted_row < tot_m * (replicate_input ? 1 : topk) && - new_idx < a_gl_stride * tot_m * (replicate_input ? 
1 : topk)) { - cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[new_idx], - a_sh_wr_pred[i]); - } - } - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); - B_ptr[i] += b_gl_rd_delta_o; - } - - if constexpr (has_act_order) { - // Fetch g_idx thread-block portion - int full_pipe = a_off; - int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; - if (cur_k < prob_k && cur_k < slice_k_finish) { - int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; - - int4 const* cur_g_idx_stage_ptr = - reinterpret_cast(&g_idx[cur_k]); - - if (threadIdx.x < g_idx_stage) { - cp_async4_pred(&sh_g_idx_stage[threadIdx.x], - &cur_g_idx_stage_ptr[threadIdx.x]); - } - } - } else { - if constexpr (group_blocks != -1) { - int4* sh_s_stage = sh_s + s_sh_stage * pipe; - - if constexpr (group_blocks >= thread_k_blocks) { - // Only fetch scales if this tile starts a new group - if (pipe % (group_blocks / thread_k_blocks) == 0) { - if (s_sh_wr_pred) { - cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); - } - s_gl_rd += s_gl_rd_delta; - } - } else { - for (int i = 0; i < s_tb_groups; i++) { - if (s_sh_wr_pred) { - cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], - &scales_ptr[s_gl_rd]); - } - s_gl_rd += s_gl_rd_delta; - } - } - } - } - } - // Insert a fence even when we are winding down the pipeline to ensure that - // waiting is also correct at this point. - cp_async_fence(); - }; - - // TODO fix - auto fetch_sorted_ids_to_shared = [&]() { - const int mpt = ceildiv(prob_m, threads); - for (int i = 0; i < mpt; i++) { - if ((i * sorted_gl_stride) + threadIdx.x < prob_m) { - sh_sorted[(i * sorted_sh_stride) + threadIdx.x] = - sorted_ids[(i * sorted_gl_stride) + threadIdx.x]; - } - } - }; - - // Wait until the next thread tile has been loaded to shared memory. - auto wait_for_stage = [&]() { - // We only have `stages - 2` active fetches since we are double buffering - // and can only issue the next fetch when it is guaranteed that the previous - // shared memory load is fully complete (as it may otherwise be - // overwritten). - cp_async_wait(); - __syncthreads(); - }; - - // Load the next sub-tile from the current location in the shared memory pipe - // into the current register buffer. 
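The double-buffering rule above, where at most stages - 2 copy groups may remain outstanding before the oldest shared-memory buffer is consumed, can be modelled with an ordinary queue. A CPU-side sketch of the cp.async group bookkeeping, purely illustrative:

    from collections import deque

    STAGES = 4  # illustrative pipeline depth


    def simulate_pipeline(num_tiles: int) -> None:
        in_flight = deque()  # committed but not yet consumed copy groups
        for tile in range(num_tiles):
            in_flight.append(tile)  # commit_group for this tile's fetch
            # wait_group(STAGES - 2): block until at most STAGES - 2 groups
            # are still pending before computing on the oldest buffer.
            while len(in_flight) > STAGES - 2:
                ready = in_flight.popleft()
                print(f"tile {ready} resident, compute on it")
        while in_flight:  # drain the tail of the pipeline
            print(f"tile {in_flight.popleft()} resident, compute on it")


    simulate_pipeline(6)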
- auto fetch_to_registers = [&](int k, int pipe) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); - }; - - bool is_same_group[stages]; - int same_group_id[stages]; - - auto init_same_group = [&](int pipe) { - int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; - int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); - - int group_id_1 = sh_g_idx_int_ptr[0]; - int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; - - is_same_group[pipe] = group_id_1 == group_id_2; - same_group_id[pipe] = group_id_1; - }; - - auto fetch_scales_to_registers = [&](int k, int full_pipe) { - int pipe = full_pipe % stages; - - if constexpr (!has_act_order) { - // No act-order case - if constexpr (group_blocks != -1) { - if constexpr (group_blocks >= thread_k_blocks) { - int4* sh_s_stage = - sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; - } else { - int warp_id = threadIdx.x / 32; - int n_warps = thread_n_blocks / 4; - - int warp_row = warp_id / n_warps; - - int cur_k = warp_row * 16; - cur_k += k_iter_size * (k % b_sh_wr_iters); - - int k_blocks = cur_k / 16; - int cur_group_id = k_blocks / group_blocks; - - int4* sh_s_stage = sh_s + s_sh_stage * pipe; - - reinterpret_cast(&frag_s[k % 2])[0] = - sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; - } - } - - return; - } - - // Act-order case - - // Determine K of the "current" thread-block - int cur_k = slice_k_start + tb_k * full_pipe; - if (cur_k >= prob_k || cur_k >= slice_k_finish) { - return; - } - - // Reset (to current thread-block) since we read g_idx portion from the - // shared memory - cur_k = 0; - - // Progress to current iteration - cur_k += k_iter_size * (k % b_sh_wr_iters); - - // Determine "position" inside the thread-block (based on warp and - // thread-id) - int warp_id = threadIdx.x / 32; - int n_warps = - thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N - - int warp_row = warp_id / n_warps; - int warp_col = warp_id % n_warps; - - cur_k += warp_row * 16; - - int th_id = threadIdx.x % 32; - cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix - - int s_col_shift = - /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + - (th_id / 4) * act_s_col_stride; - - if (is_same_group[pipe]) { - if (k % 2 == 0) { - *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = - sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + - s_col_shift]; - } else { - *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = - *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); - } - - for (int i = 1; i < 4; i++) { - *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = - *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); - } - return; - } - - int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; - int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); - - constexpr int k_frag_offsets[4] = {0, 1, 8, - 9}; // Tensor core offsets per thread - - #pragma unroll - for (int i = 0; i < 4; i++) { - int actual_k = cur_k + k_frag_offsets[i]; - - int group_id = sh_g_idx_int_ptr[actual_k]; - int rel_group_id = group_id - sh_first_group_id; - - *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = - sh_s[rel_group_id * s_sh_stride + s_col_shift]; - } - }; - 
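The matmul step that follows expands each packed int32 of B into fp16 values and applies the per-group scale. A rough reference of the numerics, assuming the symmetric 4-bit scheme (values shifted by -8) and ignoring the interleaved register layout the real dequant path uses:

    import torch


    def dequant_int32_sym4(q: int, scale: torch.Tensor) -> torch.Tensor:
        # Unpack eight 4-bit fields, shift them into the signed range
        # [-8, 7], then apply the group scale.
        nibbles = [(q >> (4 * i)) & 0xF for i in range(8)]
        vals = torch.tensor(nibbles, dtype=torch.float16) - 8
        return vals * scale


    # Example: one packed word and a single group scale of 0.05.
    print(dequant_int32_sym4(0x76543210,
                             torch.tensor(0.05, dtype=torch.float16)))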
- // Execute the actual tensor core matmul of a sub-tile. - auto matmul = [&](int k) { - // We have the m dimension as the inner loop in order to encourage overlapping - // dequantization and matmul operations. - #pragma unroll - for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - int b_quant_shift = b_quant >> 8; - - FragB frag_b0 = dequant(b_quant); - - // Apply scale to frag_b0 - if constexpr (has_act_order) { - scale4(frag_b0, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], - act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 0); - } else { - if constexpr (group_blocks != -1) { - scale(frag_b0, frag_s[k % 2][j], 0); - } - } - - FragB frag_b1 = dequant(b_quant_shift); - - // Apply scale to frag_b1 - if constexpr (has_act_order) { - scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], - act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 1); - - } else { - if constexpr (group_blocks != -1) { - scale(frag_b1, frag_s[k % 2][j], 1); - } - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); - } - } - }; - - // Since we slice across the k dimension of a tile in order to increase the - // number of warps while keeping the n dimension of a tile reasonable, we have - // multiple warps that accumulate their partial sums of the same output - // location; which we have to reduce over in the end. We do in shared memory. - auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; - if (red_off >= 1) { - int red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); - - // Parallel logarithmic shared memory reduction. We make sure to avoid any - // unnecessary read or write iterations, e.g., for two warps we write only - // once by warp 1 and read only once by warp 0. - - #pragma unroll - for (int m_block = 0; m_block < thread_m_blocks; m_block++) { - #pragma unroll - for (int i = red_off; i > 0; i /= 2) { - if (i <= red_idx && red_idx < 2 * i) { - #pragma unroll - for (int j = 0; j < 4 * 2; j++) { - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - float* c_wr = reinterpret_cast(&sh[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } - sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } - __syncthreads(); - } - if (red_idx == 0) { - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += - c_rd[j]; - } - } - __syncthreads(); - } - } - }; - - // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped - // partitioning minimizes the number of such reductions and our outputs are - // usually rather small, we perform this reduction serially in L2 cache. - auto global_reduce = [&](bool first = false, bool last = false) { - // We are very careful here to reduce directly in the output buffer to - // maximize L2 cache utilization in this step. 
To do this, we write out - // results in FP16 (but still reduce with FP32 compute). - constexpr int active_threads = 32 * thread_n_blocks / 4; - if (threadIdx.x < active_threads) { - int c_gl_stride = prob_n / 8; - int c_gl_wr_delta_o = 8 * c_gl_stride; - int c_gl_wr_delta_i = 4 * (active_threads / 32); - int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + - 4 * (threadIdx.x / 32) + threadIdx.x % 4; - c_gl_wr += (2 * thread_n_blocks) * slice_col; - constexpr int c_sh_wr_delta = active_threads; - int c_sh_wr = threadIdx.x; - - int row = (threadIdx.x % 32) / 4; - - if (!first) { - // Interestingly, doing direct global accesses here really seems to mess up - // the compiler and lead to slowdowns, hence we also use async-copies even - // though these fetches are not actually asynchronous. - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - int c_idx = - c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); - int sorted_row = sorted_ids[c_idx / c_gl_stride]; - int new_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; - cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], &C[new_idx], - sorted_row < tot_m * topk && - (8 * (i / 2) + row < prob_m && - (i < (thread_m_blocks - 1) * 4 || - sorted_ids[8 * (i / 2) + row] < tot_m * topk))); - } - cp_async_fence(); - cp_async_wait<0>(); - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (8 * (i / 2) + row < prob_m && - (i < (thread_m_blocks - 1) * 4 || - sorted_ids[8 * (i / 2) + row] < tot_m * topk)) { - if (!first) { - int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - __half2float(reinterpret_cast<__half*>(&c_red)[j]); - } - } - if (!last) { - int4 c; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast<__half*>(&c)[j] = - __float2half(reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); - } - int c_idx = - c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); - int row = sorted_ids[c_idx / c_gl_stride]; - if (row < tot_m * topk) { - int new_idx = row * c_gl_stride + c_idx % c_gl_stride; - C[new_idx] = c; - } - } - } - } - } - }; - - // Write out the reduce final result in the correct layout. We only actually - // reshuffle matrix fragments in this step, the reduction above is performed - // in fragment layout. 
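A toy sketch of the global reduction scheme described above: the partial results live in the fp16 output buffer, but every accumulation step is carried out in fp32 before being stored back (shapes and the number of partial producers are illustrative):

    import torch

    torch.manual_seed(0)
    partials = torch.randn(8, 1024, dtype=torch.float16)  # per-block partials

    acc = torch.zeros(1024, dtype=torch.float16)  # stands in for the C buffer
    for p in partials:
        # read back as fp32, accumulate in fp32, store as fp16 again
        acc = (acc.float() + p.float()).half()

    reference = partials.float().sum(dim=0)
    print((acc.float() - reference).abs().max())  # small residual error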
- auto write_result = [&]() { - int c_gl_stride = prob_n / 8; - constexpr int c_sh_stride = 2 * thread_n_blocks + 1; - int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); - constexpr int c_sh_rd_delta = - c_sh_stride * (threads / (2 * thread_n_blocks)); - - int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - c_gl_wr += (2 * thread_n_blocks) * slice_col; - int c_sh_wr = - (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; - c_sh_wr += 32 * (threadIdx.x / 32); - int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - - int c_gl_wr_end = c_gl_stride * prob_m; - - // We first reorder in shared memory to guarantee the most efficient final - // global write patterns - auto write = [&](int idx, float c0, float c1, FragS& s) { - half2 res = __halves2half2(__float2half(c0), __float2half(c1)); - - // For per-column quantization we finally apply the scale here - if constexpr (!has_act_order && group_blocks == -1) { - res = __hmul2(res, s[0]); - } - - ((half2*)sh)[idx] = res; - }; - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - #pragma unroll - for (int j = 0; j < 4; j++) { - int wr = c_sh_wr + 8 * j; - write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); - write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); - } - c_sh_wr += 16 * (4 * c_sh_stride); - } - } - __syncthreads(); - - #pragma unroll - for (int i = 0; - i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (c_gl_wr < c_gl_wr_end) { - int row = sorted_ids[c_gl_wr / c_gl_stride]; - if (row < tot_m * topk) { - int off = row * c_gl_stride + c_gl_wr % c_gl_stride; - if (!apply_weights) { - C[off] = sh[c_sh_rd]; - } else { - __half* ctrg = reinterpret_cast<__half*>(&C[off]); - __half* csrc = reinterpret_cast<__half*>(&sh[c_sh_rd]); - for (int j = 0; j < 8; ++j) { - ctrg[j] = __float2half(topk_weights[row] * __half2float(csrc[j])); - } - } - c_gl_wr += c_gl_wr_delta; - c_sh_rd += c_sh_rd_delta; - } - } - } - }; + int cur_block_rows = finish_row - start_row; - // Start global fetch and register load pipelines. - auto start_pipes = [&]() { - // fetch_sorted_ids_to_shared(); - __syncthreads(); + int row_stride = size_k * sizeof(half) / 16; - #pragma unroll - for (int i = 0; i < stages - 1; i++) { - if (has_act_order && i == 0) { - int last_g_idx = slice_k_start + stages * tb_k * 2; - if (last_g_idx >= prob_k) { - last_g_idx = prob_k - 1; - } - fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]); - } - fetch_to_shared(i, i, i < slice_iters); - } + auto permute_row = [&](int row) { + int iters = size_k / blockDim.x; + int rest = size_k % blockDim.x; - zero_accums(); - wait_for_stage(); - init_same_group(0); - fetch_to_registers(0, 0); - fetch_scales_to_registers(0, 0); - a_gl_rd += a_gl_rd_delta_o * (stages - 1); - slice_k_start_shared_fetch += tb_k * (stages - 1); - }; - if (slice_iters) { - start_pipes(); - } + int offset = row * row_stride; - // Main loop. 
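write_result above optionally multiplies every output row by its routing weight; combined with the torch.sum over the top-k dimension done on the Python side, this is the usual MoE combine. A compact PyTorch reference with hypothetical shapes:

    import torch

    M, topk, K = 4, 2, 8                   # tokens, experts per token, hidden
    expert_out = torch.randn(M * topk, K)  # one row per (token, chosen expert)
    topk_weights = torch.rand(M, topk)

    # Scale each expert output by its routing weight, then sum over the
    # top-k experts of every token.
    combined = (expert_out.view(M, topk, K) *
                topk_weights.unsqueeze(-1)).sum(dim=1)
    print(combined.shape)  # torch.Size([4, 8])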
- while (slice_iters) { - // We unroll over both the global fetch and the register load pipeline to - // ensure all shared memory accesses are static. Note that both pipelines - // have even length meaning that the next iteration will always start at - // index 0. - #pragma unroll - for (int pipe = 0; pipe < stages;) { - #pragma unroll - for (int k = 0; k < b_sh_wr_iters; k++) { - fetch_to_registers(k + 1, pipe % stages); - fetch_scales_to_registers(k + 1, pipe); - if (k == b_sh_wr_iters - 2) { - fetch_to_shared((pipe + stages - 1) % stages, pipe, - slice_iters >= stages); - pipe++; - wait_for_stage(); - init_same_group(pipe % stages); - } - matmul(k); - } - slice_iters--; - if (slice_iters == 0) { - break; - } + half const* a_row_half = reinterpret_cast(a_int4_ptr + offset); + half* out_half = reinterpret_cast(out_int4_ptr + offset); + + int base_k = 0; + + for (int i = 0; i < iters; i++) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + + base_k += blockDim.x; } - a_gl_rd += a_gl_rd_delta_o * stages; - slice_k_start += tb_k * stages; - slice_k_start_shared_fetch += tb_k * stages; + if (rest) { + if (threadIdx.x < rest) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; - if constexpr (has_act_order) { - int first_group_id = g_idx[slice_k_start]; - int last_g_idx = slice_k_start + stages * tb_k * 2; - if (last_g_idx >= prob_k) { - last_g_idx = prob_k - 1; - } - int last_group_id = g_idx[last_g_idx]; - if (last_group_id >= sh_first_group_id + sh_num_groups) { - fetch_scales_to_shared(false, first_group_id, last_group_id); - __syncthreads(); + out_half[cur_k] = a_row_half[src_pos]; } } + }; - // Process results and, if necessary, proceed to the next column slice. - // While this pattern may not be the most readable, other ways of writing - // the loop seemed to noticeably worse performance after compilation. 
- if (slice_iters == 0) { - cp_async_wait<0>(); - bool last = slice_idx == slice_count - 1; - // For per-column scales, we only fetch them here in the final step before - // write-out - if constexpr (!has_act_order && group_blocks == -1) { - if (last) { - if (s_sh_wr_pred) { - cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); - } - cp_async_fence(); - } - } + for (int i = 0; i < cur_block_rows; i++) { + int cur_row = start_row + i; + if (cur_row < size_m) { + permute_row(cur_row); + } + } +} - thread_block_reduce(); - if constexpr (!has_act_order && group_blocks == -1) { - if (last) { - cp_async_wait<0>(); - __syncthreads(); - if (threadIdx.x / 32 < thread_n_blocks / 4) { - reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; - reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; - } - } - } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice - barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); - barrier_release(&locks[slice_col], last); - } - if (last) // only the last block in a slice actually writes the result - write_result(); - slice_row = 0; - slice_col_par++; - slice_col++; - init_slice(); - if (slice_iters) { - a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; - if (slice_col == 0) { - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; - } +__global__ void compute_expert_offsets(int const* __restrict__ topk_ids, + int* __restrict__ expert_offsets, + int topk_length, int block_size) { + int expert_id = threadIdx.x; + int num_experts = blockDim.x; - // Update slice k/n for scales loading - if constexpr (has_act_order) { - slice_k_start = tb_k * slice_row; - slice_k_finish = slice_k_start + tb_k * slice_iters; - slice_k_start_shared_fetch = slice_k_start; - slice_n_offset = act_s_col_tb_stride * slice_col; + int occurrences = 0; + for (int i = 0; i < topk_length; ++i) { + occurrences += (topk_ids[i] == expert_id); + } + expert_offsets[expert_id + 1] = occurrences; + __syncthreads(); - } else { - s_gl_rd = s_sh_stride * slice_col + threadIdx.x; - } - start_pipes(); - } + if (threadIdx.x == 0) { + int tot_offset = 0; + expert_offsets[0] = 0; + for (int i = 0; i < num_experts; ++i) { + tot_offset += ceildiv(expert_offsets[i + 1], block_size) * block_size; + expert_offsets[i + 1] = tot_offset; } } + __syncthreads(); } -#else - -// TODO could just run MarlinMoE? 
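The compute_expert_offsets kernel above has a straightforward host-side equivalent that is useful for testing: count the routed tokens per expert, round each count up to the scheduling block size, and prefix-sum the padded counts into offsets. A PyTorch sketch (the helper name is ours, not part of the patch):

    import torch


    def expert_offsets_reference(topk_ids: torch.Tensor, num_experts: int,
                                 block_size: int) -> torch.Tensor:
        counts = torch.bincount(topk_ids.flatten(), minlength=num_experts)
        # Round every expert's token count up to a multiple of block_size,
        # then prefix-sum into offsets (offsets[0] == 0).
        padded = (counts + block_size - 1) // block_size * block_size
        offsets = torch.zeros(num_experts + 1, dtype=torch.int32)
        offsets[1:] = torch.cumsum(padded, dim=0)
        return offsets


    topk_ids = torch.tensor([[0, 2], [1, 2], [2, 2]])  # 3 tokens, top-2
    print(expert_offsets_reference(topk_ids, num_experts=4, block_size=4))
    # tensor([ 0,  4,  8, 12, 12], dtype=torch.int32)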
template -__device__ inline void RunSingleIter( +__device__ inline void MarlinMoESingle( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn @@ -1279,24 +318,19 @@ __device__ inline void RunSingleIter( // (k/groupsize)xn const int* __restrict__ g_idx, // int32 group indices of shape k const int* __restrict__ expert_offsets, - int num_groups, // number of scale groups per output channel - int expert_idx, // idx of current expert // TODO must decide based on offsets - int num_experts, // number of experts - int topk, // topk parameter of moe - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int tot_m, // total number of rows in A and C + int num_groups, // number of scale groups per output channel + int expert_idx, // idx of current expert + int num_experts, // number of experts + int topk, // topk parameter of moe + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int tot_m, // total number of rows in A and C int* locks, // extra global storage for barrier synchronization bool replicate_input, // do we use the same input for each expert? - bool apply_weights, // apply weights to output - int try_m_block_ctr // experiment + bool apply_weights, // apply weights to output + int current_m_block // current m block to start kernel computation from ) { - - // if (threadIdx.x == 0 && blockIdx.x == 0) { - // printf("%d, %d\n", thread_m_blocks, prob_m); - // } - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a // better partitioning with less reductions int parallel = 1; @@ -1361,8 +395,6 @@ __device__ inline void RunSingleIter( } if (slice_col == n_tiles) { sorted_ids += 16 * thread_m_blocks; - // sorted_off += 16 * thread_m_blocks; - // printf("advance 2: %d (%d %d)\n", sorted_off, blockIdx.x, threadIdx.x); locks += n_tiles; slice_col = 0; } @@ -1595,9 +627,6 @@ __device__ inline void RunSingleIter( int row = a_idx / a_gl_stride; int sorted_row = replicate_input ? sorted_ids[row] / topk : sorted_ids[row]; - // if (expert_idx == 0) { - // printf("row A: %d (%d %d), iter %d\n", row, blockIdx.x, threadIdx.x, i); - // } int new_idx = sorted_row * a_gl_stride + a_idx % a_gl_stride; if (sorted_row < tot_m * (replicate_input ? 1 : topk) && new_idx < a_gl_stride * tot_m * (replicate_input ? 
1 : topk)) { @@ -1656,7 +685,8 @@ __device__ inline void RunSingleIter( cp_async_fence(); }; - // TODO fix + // TODO we are currently hitting illegal memory accesses when fetching + // sorted_ids to shared data: fix this auto fetch_sorted_ids_to_shared = [&]() { const int mpt = ceildiv(prob_m, threads); for (int i = 0; i < mpt; i++) { @@ -1933,8 +963,6 @@ __device__ inline void RunSingleIter( int c_idx = c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); int sorted_row = sorted_ids[c_idx / c_gl_stride]; - // printf("row C reduce:\n"); - // printf("row C reduce: %d (%d %d)\n", c_idx / c_gl_stride, blockIdx.x, threadIdx.x); int new_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], &C[new_idx], sorted_row < tot_m * topk && @@ -2040,9 +1068,6 @@ __device__ inline void RunSingleIter( i++) { if (c_gl_wr < c_gl_wr_end) { int row = sorted_ids[c_gl_wr / c_gl_stride]; - // if (blockIdx.x == 8 && threadIdx.x == 95) { - // printf("row C write: %d (%d %d)\n", c_gl_wr / c_gl_stride, blockIdx.x, threadIdx.x); - // } if (row < tot_m * topk) { int off = row * c_gl_stride + c_gl_wr % c_gl_stride; if (!apply_weights) { @@ -2063,6 +1088,7 @@ __device__ inline void RunSingleIter( // Start global fetch and register load pipelines. auto start_pipes = [&]() { + // TODO re-enable after fixing this function // fetch_sorted_ids_to_shared(); __syncthreads(); @@ -2092,7 +1118,6 @@ __device__ inline void RunSingleIter( // Main loop. while (slice_iters) { - // printf("slice\n"); // We unroll over both the global fetch and the register load pipeline to // ensure all shared memory accesses are static. Note that both pipelines // have even length meaning that the next iteration will always start at @@ -2165,7 +1190,6 @@ __device__ inline void RunSingleIter( } if (slice_count > 1) { // only globally reduce if there is more than one // block in a slice - // TODO we deadlock here barrier_acquire(&locks[slice_col], slice_idx); global_reduce(slice_idx == 0, last); barrier_release(&locks[slice_col], last); @@ -2216,168 +1240,89 @@ template __global__ void MarlinMoE( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int* __restrict__ sorted_ids_base, // int32 sorted ids of experts - const float* __restrict__ topk_weights, // float topk weights + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int* __restrict__ sorted_ids_base, // int32 sorted ids of experts + const float* __restrict__ topk_weights, // float topk weights const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn const int* __restrict__ g_idx, // int32 group indices of shape k const int* __restrict__ expert_offsets, - int num_groups, // number of scale groups per output channel - int expert_idx, // idx of current expert // TODO must decide based on offsets - int num_experts, // number of experts - int topk, // topk parameter of moe - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int tot_m, // total number of rows in A and C + int num_groups, // number of scale groups per output channel + int expert_idx, // idx of current expert + int num_experts, // number of experts + int topk, // topk parameter 
of moe + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int tot_m, // total number of rows in A and C int* locks, // extra global storage for barrier synchronization bool replicate_input, // do we use the same input for each expert? - bool apply_weights, // apply weights to output - int try_m_block_ctr, // experiment - int* barrier_ctrs + bool apply_weights, // apply weights to output + int current_m_block, // current m block to start kernel computation from + int max_par // maximum parallelism ) { - // Each threadblock processes one "stripe" of the B matrix with (roughly) the - // same size, which might involve multiple column "slices" (of width 16 * - // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. - - int m_block_ctr = try_m_block_ctr; - - constexpr int max_par = 4; // TODO should be passed as arg - const int* sorted_ids_expert = sorted_ids_base + expert_offsets[expert_idx] + - m_block_ctr * 4 * max_par; - int tot_its = expert_offsets[expert_idx + 1] - - expert_offsets[expert_idx]; + int m_block_ctr = current_m_block; + + const int* sorted_ids_expert = + sorted_ids_base + expert_offsets[expert_idx] + m_block_ctr * 4 * max_par; + int tot_its = expert_offsets[expert_idx + 1] - expert_offsets[expert_idx]; if (tot_its == 0) { return; } - // TODO try no padding? int tot_m_blocks = ceildiv(tot_its, 16); - // int pad = 16 * tot_m_blocks - tot_its; - - // Main loop - for (int m_block_ctr = 0; m_block_ctr < tot_m_blocks; m_block_ctr += 4) { - - const int* sorted_ids = sorted_ids_expert; - // if (m_block_ctr >= tot_m_blocks) { - // return; - // } - - // int* locks = locks_base; //+ (prob_n / 64 * 16) * (m_block_ctr / 4); - - int max_block = tot_m_blocks - m_block_ctr; - prob_m = tot_its - 16 * m_block_ctr; - int full_prob_m = prob_m; - - // int m_offset = m_block_ctr * 16; - // printf("call with m_offset: %d / %d\n", m_offset, tot_its); - - int par = 1; - if (max_block > 4) { - // Note that parallel > 1 currently only works for inputs without any - // padding - // par = (16 * max_block - pad) / 64; - par = min((16 * max_block) / 64, max_par); - prob_m = 64 * par; - m_block_ctr += 4 * (par - 1); - max_block = 4; - } - - if (max_block == 1) { - RunSingleIter( - A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, - expert_offsets, num_groups, expert_idx, num_experts, topk, - prob_m, prob_n, prob_k, tot_m, locks, replicate_input, - apply_weights, try_m_block_ctr); - } - else if (max_block == 2) { - RunSingleIter( - A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, - expert_offsets, num_groups, expert_idx, num_experts, topk, - prob_m, prob_n, prob_k, tot_m, locks, replicate_input, - apply_weights, try_m_block_ctr); - } - else if (max_block == 3) { - RunSingleIter( - A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, - expert_offsets, num_groups, expert_idx, num_experts, topk, - prob_m, prob_n, prob_k, tot_m, locks, replicate_input, - apply_weights, try_m_block_ctr); - } - else { - RunSingleIter( - A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, - expert_offsets, num_groups, expert_idx, num_experts, topk, - prob_m, prob_n, prob_k, tot_m, locks, replicate_input, - 
apply_weights, try_m_block_ctr); - } + int pad = 16 * tot_m_blocks - tot_its; - // sorted_ids_expert += 16 * max_block * par; - // break; - // cooperative_groups::this_grid().sync(); - // __atomic__ int ctr; - if (threadIdx.x == 0) { - printf("start bar0 %d %d %d | %d\n", barrier_ctrs[0], barrier_ctrs[1], - barrier_ctrs[2], gridDim.x); - atomicAdd(&barrier_ctrs[0], 1); - // if (barrier_ctrs[2] == gridDim.x) { - // barrier_ctrs[2] = 0; - // } - // else { - while(barrier_ctrs[0] != gridDim.x); - // } - if (blockIdx.x == 0) { - barrier_ctrs[2] = 0; - } - printf("start bar1 %d %d %d | %d\n", barrier_ctrs[0], barrier_ctrs[1], - barrier_ctrs[2], gridDim.x); - atomicAdd(&barrier_ctrs[1], 1); - // if (barrier_ctrs[0] == gridDim.x) { - // barrier_ctrs[0] = 0; - // } - // else { - while(barrier_ctrs[1] != gridDim.x); - // } - if (blockIdx.x == 0) { - barrier_ctrs[0] = 0; - } - printf("start bar2 %d %d %d | %d\n", barrier_ctrs[0], barrier_ctrs[1], - barrier_ctrs[2], gridDim.x); - atomicAdd(&barrier_ctrs[2], 1); - // if (barrier_ctrs[1] == gridDim.x) { - // barrier_ctrs[1] = 0; - // } - // else { - while(barrier_ctrs[2] != gridDim.x); - // } - if (blockIdx.x == 0) { - barrier_ctrs[1] = 0; - } - printf("end bar %d\n", gridDim.x); - } + if (m_block_ctr >= tot_m_blocks) { + return; + } - // barrier_acquire(&locks2[blockIdx.x], gridDim.x, 0, 0); - // barrier_release(&locks2[blockIdx.x], gridDim.x, 0, 0); + int max_block = tot_m_blocks - m_block_ctr; + prob_m = tot_its - 16 * m_block_ctr; + + int par = 1; + if (max_block > 4) { + // Note that parallel > 1 currently only works for inputs without any + // padding + par = (16 * max_block - pad) / 64; + par = min((16 * max_block - pad) / 64, max_par); + prob_m = 64 * par; + m_block_ctr += 4 * (par - 1); + max_block = 4; + } + if (max_block == 1) { + MarlinMoESingle( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, + prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, + current_m_block); + } else if (max_block == 2) { + MarlinMoESingle( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, + prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, + current_m_block); + } else if (max_block == 3) { + MarlinMoESingle( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, + prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, + current_m_block); + } else { + MarlinMoESingle( + A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, + expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, + prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, + current_m_block); } } -#endif - #else __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, @@ -2390,12 +1335,11 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, } __global__ void compute_expert_offsets(int const* __restrict__ topk_ids, - int* __restrict__ expert_offsets, - int topk_length, - int block_size) { - // Marlin is not implemented yet for SM < 8.0 - assert(false); - return; + int* __restrict__ expert_offsets, + int topk_length, int block_size) { + // Marlin is not implemented yet for SM < 8.0 + assert(false); + return; } template \ <<>>( \ A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \ - g_idx_ptr, expert_offsets2_ptr, num_groups, expert_idx, \ + g_idx_ptr, 
expert_offsets_ptr, num_groups, expert_idx, \ num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, \ - replicate_input, apply_weights, m_block, barrier_ctrs_ptr); \ + replicate_input, apply_weights, m_block, max_par); \ } typedef struct { @@ -2578,16 +1522,14 @@ thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, const void* sorted_ids, const void* topk_weights, - const void* topk_ids, - const void* s, const void* g_idx, const void* perm, - void* a_tmp, void* expert_offsets, void* expert_offsets2, int prob_m, - int prob_n, int prob_k, void* workspace, + const void* topk_ids, const void* s, const void* g_idx, + const void* perm, void* a_tmp, void* expert_offsets, + int prob_m, int prob_n, int prob_k, void* workspace, bool has_act_order, bool is_k_full, int num_groups, - int group_size, - int num_experts, int topk, int moe_block_size, int dev, - cudaStream_t stream, int thread_k, int thread_n, - int sms, int max_par, bool replicate_input, - bool apply_weights, void* barrier_ctrs) { + int group_size, int num_experts, int topk, + int moe_block_size, int dev, cudaStream_t stream, + int thread_k, int thread_n, int sms, int max_par, + bool replicate_input, bool apply_weights) { TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]"); @@ -2620,7 +1562,6 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, int thread_n_blocks = thread_n / 16; int blocks = sms; - // printf("sms: %d\n", sms); TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, " is not divisible by thread_n = ", thread_n); @@ -2656,16 +1597,10 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, int tot_m = prob_m; - #if CPU_OFFSETS - const long* expert_offsets_ptr = (const long*)expert_offsets; - int* expert_offsets2_ptr = (int*)expert_offsets2; - #else const int* topk_ids_ptr = (const int*)topk_ids; - int* expert_offsets2_ptr = (int*)expert_offsets2; + int* expert_offsets_ptr = (int*)expert_offsets; compute_expert_offsets<<<1, num_experts, 0, stream>>>( - topk_ids_ptr, expert_offsets2_ptr, tot_m * topk, moe_block_size); - #endif - int* barrier_ctrs_ptr = (int*)barrier_ctrs; + topk_ids_ptr, expert_offsets_ptr, tot_m * topk, moe_block_size); bool do_permute_a = has_act_order; @@ -2677,108 +1612,21 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, } for (int expert_idx = 0; expert_idx < num_experts; ++expert_idx) { - #if CPU_OFFSETS - const int4* A_ptr = (const int4*)A; - int4* a_tmp_ptr = (int4*)a_tmp; - const int4* B_ptr = (const int4*)B + (prob_n * prob_k / 32) * expert_idx; - int4* C_ptr = (int4*)C; - const float* topk_weights_ptr = (const float*)topk_weights; - const int* sorted_ids_ptr = - (const int*)sorted_ids + expert_offsets_ptr[expert_idx]; - const int4* s_ptr = - (const int4*)s + - (((group_size == -1 || group_size == 0) ? 1 : prob_k / group_size) * - prob_n / 8) * - expert_idx; - - const int* g_idx_ptr = (const int*)g_idx + prob_k * expert_idx; - const int* perm_ptr = (const int*)perm + prob_k * expert_idx; - int* locks = (int*)workspace; - - if (do_permute_a) { - // Permute A columns - int topk_rows = replicate_input ? 
tot_m : tot_m * topk; - int block_rows = ceildiv(topk_rows, blocks); - permute_cols_kernel<<>>( - A_ptr, perm_ptr, a_tmp_ptr, topk_rows, prob_k, block_rows); - A_ptr = a_tmp_ptr; - } - - int tot_its = expert_offsets_ptr[expert_idx + 1] - - expert_offsets_ptr[expert_idx]; // prob_m; - // printf("%d ", tot_its); - if (tot_its == 0) { - continue; - } - int tot_m_blocks = ceildiv(tot_its, 16); - int pad = 16 * tot_m_blocks - tot_its; - - // Main loop - for (int i = 0; i < tot_m_blocks; i += 4) { - int thread_m_blocks = tot_m_blocks - i; - prob_m = tot_its - 16 * i; - int par = 1; - if (thread_m_blocks > 4) { - // Note that parallel > 1 currently only works for inputs without any - // padding - par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) par = max_par; - prob_m = 64 * par; - i += 4 * (par - 1); - thread_m_blocks = 4; - } - - // doesn't matter for this version of the code - int m_block = 0; - - // Define kernel configurations - - if (false) { - } - CALL_IF_MOE(16, 4, 256) - CALL_IF_MOE(8, 8, 256) - CALL_IF_MOE(8, 4, 128) - CALL_IF_MOE(4, 8, 128) - else { - TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + - str(prob_n) + ", " + str(prob_k) + "]" + - ", has_act_order = " + str(has_act_order) + - ", num_groups = " + str(num_groups) + - ", group_size = " + str(group_size) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); - } - - sorted_ids_ptr += 16 * thread_m_blocks * par; - // break; - } - - ///// - - #else - - ///// - const int4* A_ptr = (const int4*)A; int4* a_tmp_ptr = (int4*)a_tmp; const int4* B_ptr = (const int4*)B + (prob_n * prob_k / 32) * expert_idx; int4* C_ptr = (int4*)C; const float* topk_weights_ptr = (const float*)topk_weights; - // TODO can't know expert_offsets at this point - const int* sorted_ids_ptr = - (const int*)sorted_ids;// + expert_offsets_ptr[expert_idx]; + const int* sorted_ids_ptr = (const int*)sorted_ids; const int4* s_ptr = (const int4*)s + (((group_size == -1 || group_size == 0) ? 1 : prob_k / group_size) * prob_n / 8) * expert_idx; - const int* g_idx_ptr = (const int*)g_idx + prob_k * expert_idx; const int* perm_ptr = (const int*)perm + prob_k * expert_idx; int* locks = (int*)workspace; - // TODO we need an expert identifying mechanism here too if (do_permute_a) { // Permute A columns int topk_rows = replicate_input ? 
tot_m : tot_m * topk; @@ -2789,8 +1637,7 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, } int max_m_blocks = ceildiv(tot_m, 16); - int m_block = 0; - // for (int m_block = 0; m_block < max_m_blocks; m_block += 16) { + for (int m_block = 0; m_block < max_m_blocks; m_block += 16) { // Define kernel configurations // make it max possible value @@ -2811,14 +1658,9 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, ", thread_m_blocks = " + str(thread_m_blocks) + ", thread_n_blocks = " + str(thread_n_blocks) + ", thread_k_blocks = " + str(thread_k_blocks)); - // } - - // sorted_ids_ptr += 16 * thread_m_blocks * max_par; - // sorted_ids_ptr += 16 * thread_m_blocks * 4; + } } - #endif } - // printf("\n"); } } // namespace marlin_moe @@ -2826,34 +1668,24 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, torch::Tensor marlin_gemm_moe( const torch::Tensor& a, const torch::Tensor& b_q_weights, const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights, - const torch::Tensor& topk_ids, - const torch::Tensor& b_scales, const torch::Tensor& g_idx, - const torch::Tensor& perm, const torch::Tensor& expert_offsets, + const torch::Tensor& topk_ids, const torch::Tensor& b_scales, + const torch::Tensor& g_idx, const torch::Tensor& perm, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, int64_t num_experts, - int64_t topk, int64_t moe_block_size, bool replicate_input, - bool apply_weights) { + bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size, + bool replicate_input, bool apply_weights) { int max_par = 4; int dev = a.get_device(); - auto options_dtype = torch::TensorOptions().dtype(a.dtype()).device(a.device()); - auto options_int = torch::TensorOptions().dtype(torch::kInt).device(a.device()); + auto options_dtype = + torch::TensorOptions().dtype(a.dtype()).device(a.device()); + auto options_int = + torch::TensorOptions().dtype(torch::kInt).device(a.device()); torch::Tensor c = torch::zeros({size_m, topk, size_n}, options_dtype); - torch::Tensor a_tmp = replicate_input - ? torch::zeros({size_m, size_k}, options_dtype) - : torch::zeros({size_m, topk, size_k}, options_dtype); - #if CPU_OFFSETS - torch::Tensor expert_offsets2 = torch::empty({0}, options_dtype); - #else - torch::Tensor expert_offsets2 - = torch::empty({num_experts + 1}, options_int); - // torch::Tensor expert_offsets2 = torch::arange(0, - // num_experts * moe_block_size, moe_block_size, - // torch::TensorOptions().dtype(torch::kInt).device(a.device())); - // torch::Tensor expert_offsets2 = expert_offsets; - #endif - torch::Tensor barrier_ctrs = torch::zeros({3}, options_int); + torch::Tensor a_tmp = + replicate_input ? 
torch::zeros({size_m, size_k}, options_dtype) + : torch::zeros({size_m, topk, size_k}, options_dtype); + torch::Tensor expert_offsets = torch::empty({num_experts + 1}, options_int); // thread_k: `k` size of a thread_tile in `weights` (can usually be left as // auto -1) @@ -2896,20 +1728,13 @@ torch::Tensor marlin_gemm_moe( } } - // std::stringstream sstream; - // sstream << topk_ids.dtype().name(); - // std::string s = sstream.str(); - // printf("topk dtype: %s\n", s.c_str()); - - // printf("run with %ld, %ld, %ld\n", size_m, size_n, size_k); - marlin_moe::marlin_mm_moe_f16i4( a.data_ptr(), b_q_weights.data_ptr(), c.data_ptr(), sorted_ids.data_ptr(), - topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(), g_idx.data_ptr(), - perm.data_ptr(), a_tmp.data_ptr(), expert_offsets.data_ptr(), expert_offsets2.data_ptr(), size_m, - size_n, size_k, workspace.data_ptr(), has_act_order, is_k_full, - num_groups, group_size, num_experts, topk, + topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(), + g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), + expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), + has_act_order, is_k_full, num_groups, group_size, num_experts, topk, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, - thread_n, sms, max_par, replicate_input, apply_weights, barrier_ctrs.data_ptr()); + thread_n, sms, max_par, replicate_input, apply_weights); return c; } diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h index a24ca32a52be7..78e2f5d346652 100644 --- a/csrc/moe/marlin_moe_ops.h +++ b/csrc/moe/marlin_moe_ops.h @@ -5,10 +5,9 @@ torch::Tensor marlin_gemm_moe( const torch::Tensor& a, const torch::Tensor& b_q_weights, const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights, - const torch::Tensor& topk_ids, - const torch::Tensor& b_scales, const torch::Tensor& g_idx, - const torch::Tensor& perm, const torch::Tensor& expert_offsets, - torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, int64_t num_experts, - int64_t topk, int64_t moe_block_size, bool replicate_input, - bool apply_weights); + const torch::Tensor& topk_ids, const torch::Tensor& b_scales, + const torch::Tensor& g_idx, const torch::Tensor& perm, + const torch::Tensor& expert_offsets, torch::Tensor& workspace, + int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, + int64_t num_experts, int64_t topk, int64_t moe_block_size, + bool replicate_input, bool apply_weights); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index ca1b5c3341ef1..0b4b92e16f7b4 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -14,11 +14,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " - "g_idx, Tensor! perm, " - "Tensor! expert_offsets, Tensor! workspace, int size_m, int size_n, int " - "size_k, bool is_k_full, int num_experts, " - "int topk, int moe_block_size, bool replicate_input, bool apply_weights) " - "-> Tensor"); + "g_idx, Tensor! perm, Tensor! 
workspace, int size_m, int size_n, int " + "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, " + "bool replicate_input, bool apply_weights) -> Tensor"); + ops.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); } diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 47400f06e02e0..3774a442f9180 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -384,6 +384,8 @@ def fused_topk( topk, dtype=torch.int32, device=hidden_states.device) + from pprint import pprint + pprint(vars(ops)) ops.topk_softmax( topk_weights, topk_ids, @@ -628,19 +630,6 @@ def fused_moe( a2_scale=a2_scale) -def get_expert_offsets(sorted_token_ids: torch.Tensor, topk_ids: torch.Tensor, - num_experts: int, block_size_m: int): - expert_offsets = [0] * (num_experts + 1) - occurrences = torch.bincount(topk_ids.flatten()).to(dtype=torch.int) - erange = min(num_experts, len(occurrences)) - for i in range(erange): - ex_blocks = (occurrences[i].item() + block_size_m - 1) // block_size_m - expert_offsets[i + 1] = ex_blocks * block_size_m + expert_offsets[i] - for i in range(len(occurrences), num_experts): - expert_offsets[i + 1] = sorted_token_ids.size()[0] - return torch.as_tensor(expert_offsets) - - def single_marlin_moe( hidden_states: torch.Tensor, w: torch.Tensor, @@ -712,13 +701,10 @@ def single_marlin_moe( device="cuda", requires_grad=False) - expert_offsets = get_expert_offsets(sorted_token_ids, topk_ids, E, - block_size_m) - intermediate_cache = torch.ops._moe_C.marlin_gemm_moe( hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales, - g_idx, rand_perm, expert_offsets, workspace, M, N, K, True, E, topk, - block_size_m, True, False) + g_idx, rand_perm, workspace, M, N, K, True, E, topk, block_size_m, + True, False) return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) @@ -804,25 +790,21 @@ def fused_marlin_moe(hidden_states: torch.Tensor, device="cuda", requires_grad=False) - expert_offsets = get_expert_offsets(sorted_token_ids, topk_ids, E, - block_size_m) - # expert_offsets = torch.empty((0)) - intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N), device=hidden_states.device, dtype=hidden_states.dtype) intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, topk_weights, topk_ids, w1_scale, - g_idx1, rand_perm1, expert_offsets, workspace, M, 2 * N, K, True, E, - topk, block_size_m, True, False) + g_idx1, rand_perm1, workspace, M, 2 * N, K, True, E, topk, + block_size_m, True, False) ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, topk_weights, topk_ids, - w2_scale, g_idx2, rand_perm2, expert_offsets, workspace, M, K, N, True, - E, topk, block_size_m, False, True) + w2_scale, g_idx2, rand_perm2, workspace, M, K, N, True, E, topk, + block_size_m, False, True) # intermediate_cache3 = torch.zeros((M, topk, K), # device=hidden_states.device, diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 86e6e3c2b299f..85dafd55bbcf8 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -369,15 +369,8 @@ def __init__( ) -> None: super().__init__() - # print(config) - # print(cache_config) - # print(quant_config) - - # FP8 hasn't been tested. 
Works only with enforce-eager - self.use_fused_moe = True - #(config.torch_dtype != torch.float8_e4m3fn and - #config.torch_dtype != torch.float16) - # print("use fused?", config.torch_dtype) + # TODO check runs with dtype=float16 + self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) self.config = config self.quant_config = quant_config From b0c4671f706fcf25fe24c3c025aec6e60f9c8b71 Mon Sep 17 00:00:00 2001 From: Eliza Wszola Date: Fri, 2 Aug 2024 15:26:51 +0000 Subject: [PATCH 003/106] Fix build issues --- csrc/moe/marlin_moe_ops.h | 5 ++--- csrc/moe/torch_bindings.cpp | 12 +++++------- vllm/model_executor/layers/fused_moe/fused_moe.py | 12 +++--------- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h index 78e2f5d346652..43d264e0770d6 100644 --- a/csrc/moe/marlin_moe_ops.h +++ b/csrc/moe/marlin_moe_ops.h @@ -7,7 +7,6 @@ torch::Tensor marlin_gemm_moe( const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights, const torch::Tensor& topk_ids, const torch::Tensor& b_scales, const torch::Tensor& g_idx, const torch::Tensor& perm, - const torch::Tensor& expert_offsets, torch::Tensor& workspace, - int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, - int64_t num_experts, int64_t topk, int64_t moe_block_size, + torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, + bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size, bool replicate_input, bool apply_weights); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 0b4b92e16f7b4..0ed9b1f64590a 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -2,23 +2,21 @@ #include "moe_ops.h" #include "marlin_moe_ops.h" -#include - -TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { // Apply topk softmax to the gating outputs. - ops.def( + m.def( "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! " "token_expert_indices, Tensor gating_output) -> ()"); - ops.impl("topk_softmax", torch::kCUDA, &topk_softmax); + m.impl("topk_softmax", torch::kCUDA, &topk_softmax); - ops.def( + m.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " "g_idx, Tensor! perm, Tensor! 
workspace, int size_m, int size_n, int " "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, " "bool replicate_input, bool apply_weights) -> Tensor"); - ops.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); + m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); } REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3774a442f9180..64e47ad803232 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -384,8 +384,7 @@ def fused_topk( topk, dtype=torch.int32, device=hidden_states.device) - from pprint import pprint - pprint(vars(ops)) + ops.topk_softmax( topk_weights, topk_ids, @@ -692,8 +691,7 @@ def single_marlin_moe( block_size_m = config['BLOCK_SIZE_M'] - sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( - topk_ids, block_size_m, E) + sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) max_workspace_size = (N // 64) * 16 workspace = torch.zeros(max_workspace_size, @@ -781,8 +779,7 @@ def fused_marlin_moe(hidden_states: torch.Tensor, block_size_m = config['BLOCK_SIZE_M'] - sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( - topk_ids, block_size_m, E) + sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 workspace = torch.zeros(max_workspace_size, @@ -806,8 +803,5 @@ def fused_marlin_moe(hidden_states: torch.Tensor, w2_scale, g_idx2, rand_perm2, workspace, M, K, N, True, E, topk, block_size_m, False, True) - # intermediate_cache3 = torch.zeros((M, topk, K), - # device=hidden_states.device, - # dtype=hidden_states.dtype) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) From e5c1a8131c970fbb42540b518c8e37d3d0b150e8 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 6 Aug 2024 17:20:26 -0700 Subject: [PATCH 004/106] Refactoring for maintainability --- .../layers/fused_moe/__init__.py | 18 +- .../layers/fused_moe/fused_moe.py | 102 +--- .../layers/fused_moe/fused_moe_gptq.py | 138 +++++ vllm/model_executor/layers/fused_moe/layer.py | 482 ++++++------------ .../layers/quantization/gptq_marlin.py | 356 ++++++++++++- vllm/model_executor/models/mixtral_quant.py | 144 ++---- 6 files changed, 665 insertions(+), 575 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/fused_moe_gptq.py diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 080ecb5cfe0ba..2b982b7ab9f86 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,21 +1,23 @@ -from vllm.model_executor.layers.fused_moe.fused_moe import (fused_marlin_moe, - single_marlin_moe) -from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, - FusedMoEMethodBase) +from vllm.model_executor.layers.fused_moe.fused_moe_gptq import fused_moe_gptq +from vllm.model_executor.layers.fused_moe.fused_moe import single_marlin_moe +from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase from vllm.triton_utils import HAS_TRITON __all__ = [ "FusedMoE", "FusedMoEMethodBase", - "fused_marlin_moe", + "fused_moe_gptq", "single_marlin_moe", ] if HAS_TRITON: - from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts, fused_moe, fused_topk, get_config_file_name, - 
grouped_topk) + fused_experts, + fused_moe, + fused_topk, + get_config_file_name, + grouped_topk, + ) __all__ += [ "fused_moe", diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 64e47ad803232..9ae5859c4da0c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -704,104 +704,4 @@ def single_marlin_moe( g_idx, rand_perm, workspace, M, N, K, True, E, topk, block_size_m, True, False) - return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) - - -def fused_marlin_moe(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - gating_output: torch.Tensor, - g_idx1: torch.Tensor, - g_idx2: torch.Tensor, - rand_perm1: torch.Tensor, - rand_perm2: torch.Tensor, - topk: int, - renormalize: bool, - override_config: Optional[Dict[str, Any]] = None, - use_fp8: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - """ - This function computes a Mixture of Experts (MoE) layer using two sets of - weights, w1 and w2, and top-k gating mechanism. - - Parameters: - - hidden_states (torch.Tensor): The input tensor to the MoE layer. - - w1 (torch.Tensor): The first set of expert weights. - - w2 (torch.Tensor): The second set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation - (before softmax). - - topk (int): The number of top-k experts to select. - - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. - Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override - for the kernel configuration. - - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner - products for w1 and w2. Defaults to False. - - w1_scale (Optional[torch.Tensor]): Optional scale to be used for - w1. - - w2_scale (Optional[torch.Tensor]): Optional scale to be used for - w2. - - Returns: - - torch.Tensor: The output tensor after applying the MoE layer. - """ - # Check constraints. 
- assert hidden_states.shape[0] == gating_output.shape[0], ( - "Number of tokens mismatch") - assert hidden_states.shape[ - 1] == w1.shape[1] * 16, "Hidden size mismatch w1" - assert hidden_states.shape[ - 1] == w2.shape[2] // 2, "Hidden size mismatch w2" - assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" - assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" - assert w1.is_contiguous(), "Expert weights1 must be contiguous" - assert w2.is_contiguous(), "Expert weights2 must be contiguous" - assert hidden_states.dtype in [ - torch.float32, torch.float16, torch.bfloat16 - ] - M, K = hidden_states.shape - E = w1.shape[0] - N = w2.shape[1] * 16 - - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize) - - get_config_func = functools.partial(try_get_optimal_moe_config, - w1.shape, - w2.shape, - topk_ids.shape[1], - "float8" if use_fp8 else None, - override_config=override_config, - is_marlin=True) - config = get_config_func(M) - - block_size_m = config['BLOCK_SIZE_M'] - - sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) - - max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 - workspace = torch.zeros(max_workspace_size, - dtype=torch.int, - device="cuda", - requires_grad=False) - - intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N), - device=hidden_states.device, - dtype=hidden_states.dtype) - - intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe( - hidden_states, w1, sorted_token_ids, topk_weights, topk_ids, w1_scale, - g_idx1, rand_perm1, workspace, M, 2 * N, K, True, E, topk, - block_size_m, True, False) - - ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - - intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( - intermediate_cache2, w2, sorted_token_ids, topk_weights, topk_ids, - w2_scale, g_idx2, rand_perm2, workspace, M, K, N, True, E, topk, - block_size_m, False, True) - - return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), - dim=1) + return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_gptq.py b/vllm/model_executor/layers/fused_moe/fused_moe_gptq.py new file mode 100644 index 0000000000000..15c11fc0b668e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/fused_moe_gptq.py @@ -0,0 +1,138 @@ +"""Fused MoE utilities for GPTQ.""" +import functools +import torch + +from typing import Any, Dict, Optional +from vllm import _custom_ops as ops + +from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config + + +def fused_moe_gptq( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + g_idx1: torch.Tensor, + g_idx2: torch.Tensor, + rand_perm1: torch.Tensor, + rand_perm2: torch.Tensor, + topk: int, + renormalize: bool, + override_config: Optional[Dict[str, Any]] = None, + use_fp8: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). 
+ - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner + products for w1 and w2. Defaults to False. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for + w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for + w2. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. + assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" + assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1" + assert hidden_states.shape[1] == w2.shape[2] // 2, "Hidden size mismatch w2" + assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.is_contiguous(), "Expert weights1 must be contiguous" + assert w2.is_contiguous(), "Expert weights2 must be contiguous" + assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16] + M, K = hidden_states.shape + E = w1.shape[0] + N = w2.shape[1] * 16 + + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, renormalize) + + get_config_func = functools.partial( + try_get_optimal_moe_config, + w1.shape, + w2.shape, + topk_ids.shape[1], + "float8" if use_fp8 else None, + override_config=override_config, + is_marlin=True, + ) + config = get_config_func(M) + + block_size_m = config["BLOCK_SIZE_M"] + + sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) + + max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 + workspace = torch.zeros( + max_workspace_size, dtype=torch.int, device="cuda", requires_grad=False + ) + + intermediate_cache2 = torch.empty( + (M * topk_ids.shape[1], N), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe( + hidden_states, + w1, + sorted_token_ids, + topk_weights, + topk_ids, + w1_scale, + g_idx1, + rand_perm1, + workspace, + M, + 2 * N, + K, + True, + E, + topk, + block_size_m, + True, + False, + ) + + ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) + + intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( + intermediate_cache2, + w2, + sorted_token_ids, + topk_weights, + topk_ids, + w2_scale, + g_idx2, + rand_perm2, + workspace, + M, + K, + N, + True, + E, + topk, + block_size_m, + False, + True, + ) + + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 564a316b4894a..913d6a93b0cd5 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -6,16 +6,17 @@ import torch from vllm import _custom_ops as ops -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.fused_moe.fused_moe import 
fused_marlin_moe from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, QuantizeMethodBase) -from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinConfig) + QuantizationConfig, + QuantizeMethodBase, +) from vllm.model_executor.utils import set_weight_attrs logger = init_logger(__name__) @@ -24,300 +25,63 @@ class FusedMoEMethodBase(QuantizeMethodBase): @abstractmethod - def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, - params_dtype: torch.dtype, **extra_weight_attrs): + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): raise NotImplementedError @abstractmethod - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool = True, - use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, - topk_group: Optional[int] = None) -> torch.Tensor: + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + ) -> torch.Tensor: raise NotImplementedError -class GPTQMarlinState(Enum): - REPACK = enum.auto() - READY = enum.auto() - - -class MarlinFusedMoEMethod(FusedMoEMethodBase): - """MoE Marlin method with quantization.""" - - def __init__(self, quant_config: GPTQMarlinConfig) -> None: - self.quant_config = quant_config - - def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, - params_dtype: torch.dtype, **extra_weight_attrs): - # Currently assuming is_k_full is always True - # (input size per partition is the same as full input size) - # Supports only sym for now (no zp) - if self.quant_config.group_size != -1: - scales_size13 = hidden_size // self.quant_config.group_size - scales_size2 = intermediate_size // self.quant_config.group_size - else: - scales_size13 = 1 - scales_size2 = 1 - # Fused gate_up_proj (column parallel) - w13_qweight = torch.nn.Parameter(torch.empty( - num_experts, - hidden_size // self.quant_config.pack_factor, - 2 * intermediate_size, - dtype=torch.int32), - requires_grad=False) - layer.register_parameter("w13_qweight", w13_qweight) - set_weight_attrs(w13_qweight, extra_weight_attrs) - # down_proj (row parallel) - w2_qweight = torch.nn.Parameter(torch.empty( - num_experts, - intermediate_size // self.quant_config.pack_factor, - hidden_size, - dtype=torch.int32), - requires_grad=False) - layer.register_parameter("w2_qweight", w2_qweight) - set_weight_attrs(w2_qweight, extra_weight_attrs) - # up_proj scales - w13_scales = torch.nn.Parameter(torch.empty(num_experts, - scales_size13, - 2 * intermediate_size, - dtype=params_dtype), - requires_grad=False) - layer.register_parameter("w13_scales", w13_scales) - set_weight_attrs(w13_scales, extra_weight_attrs) - # down_proj scales - w2_scales = torch.nn.Parameter(torch.empty(num_experts, - scales_size2, - hidden_size, - dtype=params_dtype), - requires_grad=False) - layer.register_parameter("w2_scales", w2_scales) - set_weight_attrs(w2_scales, extra_weight_attrs) - w13_g_idx = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - dtype=torch.int32, - ), - requires_grad=False, - ) - layer.register_parameter("w13_g_idx", w13_g_idx) - 
set_weight_attrs(w13_g_idx, extra_weight_attrs) - w2_g_idx = torch.nn.Parameter( - torch.empty( - num_experts, - intermediate_size, - dtype=torch.int32, - ), - requires_grad=False, - ) - layer.register_parameter("w2_g_idx", w2_g_idx) - set_weight_attrs(w2_g_idx, extra_weight_attrs) - w13_g_idx_sort_indices = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - dtype=torch.int32, - ), - requires_grad=False, - ) - layer.register_parameter("w13_g_idx_sort_indices", - w13_g_idx_sort_indices) - set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) - w2_g_idx_sort_indices = torch.nn.Parameter( - torch.empty( - num_experts, - intermediate_size, - dtype=torch.int32, - ), - requires_grad=False, - ) - layer.register_parameter("w2_g_idx_sort_indices", - w2_g_idx_sort_indices) - set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) - layer.marlin_state = GPTQMarlinState.REPACK - - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool = True, - use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, - topk_group: Optional[int] = None) -> torch.Tensor: - if layer.marlin_state == GPTQMarlinState.REPACK: - layer.marlin_state = GPTQMarlinState.READY - - # Newly generated tensors need to replace existing tensors that are - # already registered as parameters by vLLM (and won't be freed) - def replace_tensor(name, new_t): - # It is important to use resize_() here since it ensures - # the same buffer is reused - getattr(layer, name).resize_(new_t.shape) - getattr(layer, name).copy_(new_t) - del new_t - - def get_scale_perms(num_bits: int): - scale_perm: List[int] = [] - for i in range(8): - scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: List[int] = [] - for i in range(4): - scale_perm_single.extend( - [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) - return scale_perm, scale_perm_single - - def marlin_permute_scales(s: torch.Tensor, size_k: int, - size_n: int, group_size: int, - num_bits: int): - scale_perm, scale_perm_single = get_scale_perms(num_bits) - if group_size < size_k and group_size != -1: - s = s.reshape((-1, len(scale_perm)))[:, scale_perm] - else: - s = s.reshape( - (-1, len(scale_perm_single)))[:, scale_perm_single] - s = s.reshape((-1, size_n)).contiguous() - return s - - def marlin_moe_permute_scales(s: torch.Tensor, size_k: int, - size_n: int, group_size: int, - num_bits: int): - num_experts = s.shape[0] - output = torch.empty((num_experts, s.shape[1], s.shape[2]), - device=s.device, - dtype=s.dtype) - for e in range(num_experts): - output[e] = marlin_permute_scales(s[e], size_k, size_n, - group_size, num_bits) - return output - - # Process act_order - if self.quant_config.desc_act: - # Get sorting based on g_idx - num_experts = layer.w13_g_idx.shape[0] - w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) - w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) - w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) - w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) - for e in range(num_experts): - w13_g_idx_sort_indices[e] = torch.argsort( - layer.w13_g_idx[e]).to(torch.int32) - w2_g_idx_sort_indices[e] = torch.argsort( - layer.w2_g_idx[e]).to(torch.int32) - w13_sorted_g_idx[e] = layer.w13_g_idx[e][ - w13_g_idx_sort_indices[e]] - w2_sorted_g_idx[e] = layer.w2_g_idx[e][ - w2_g_idx_sort_indices[e]] - replace_tensor("w13_g_idx", w13_sorted_g_idx) - replace_tensor("w2_g_idx", w2_sorted_g_idx) - replace_tensor("w13_g_idx_sort_indices", - 
w13_g_idx_sort_indices) - replace_tensor("w2_g_idx_sort_indices", w2_g_idx_sort_indices) - else: - # Reset g_idx related tensors - num_experts = layer.w13_g_idx.shape[0] - device = layer.w13_g_idx.device - layer.w13_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), - dtype=torch.int32, - device=device), - requires_grad=False, - ) - layer.w2_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), - dtype=torch.int32, - device=device), - requires_grad=False, - ) - layer.w13_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), - dtype=torch.int32, - device=device), - requires_grad=False, - ) - layer.w2_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), - dtype=torch.int32, - device=device), - requires_grad=False, - ) - # Repack weights - marlin_w13_qweight = ops.gptq_marlin_moe_repack( - layer.w13_qweight, - layer.w13_g_idx_sort_indices, - layer.w13_qweight.shape[1] * self.quant_config.pack_factor, - layer.w13_qweight.shape[2], - self.quant_config.weight_bits, - ) - replace_tensor("w13_qweight", marlin_w13_qweight) - marlin_w2_qweight = ops.gptq_marlin_moe_repack( - layer.w2_qweight, - layer.w2_g_idx_sort_indices, - layer.w2_qweight.shape[1] * self.quant_config.pack_factor, - layer.w2_qweight.shape[2], - self.quant_config.weight_bits, - ) - replace_tensor("w2_qweight", marlin_w2_qweight) - # Repack scales - marlin_w13_scales = marlin_moe_permute_scales( - layer.w13_scales, - x.shape[1], - layer.w13_scales.shape[2], - self.quant_config.group_size, - self.quant_config.weight_bits, - ) - replace_tensor("w13_scales", marlin_w13_scales) - marlin_w2_scales = marlin_moe_permute_scales( - layer.w2_scales, - layer.w2_scales.shape[1] * self.quant_config.pack_factor, - x.shape[1], - self.quant_config.group_size, - self.quant_config.weight_bits, - ) - replace_tensor("w2_scales", marlin_w2_scales) - return fused_marlin_moe(x, - layer.w13_qweight, - layer.w2_qweight, - router_logits, - layer.w13_g_idx, - layer.w2_g_idx, - layer.w13_g_idx_sort_indices, - layer.w2_g_idx_sort_indices, - top_k, - renormalize=renormalize, - w1_scale=layer.w13_scales, - w2_scale=layer.w2_scales) - - class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" - def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, - params_dtype: torch.dtype, **extra_weight_attrs): - + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): # Fused gate_up_proj (column parallel) - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=params_dtype), - requires_grad=False) + w13_weight = torch.nn.Parameter( + torch.empty(num_experts, + 2 * intermediate_size, + hidden_size, + dtype=params_dtype), + requires_grad=False, + ) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) # down_proj (row parallel) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=params_dtype), - requires_grad=False) + w2_weight = torch.nn.Parameter( + torch.empty(num_experts, + hidden_size, + intermediate_size, + dtype=params_dtype), + requires_grad=False, + ) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) @@ -332,9 +96,17 @@ def apply( num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, 
) -> torch.Tensor: - return self.forward(x, layer.w13_weight, layer.w2_weight, - router_logits, top_k, renormalize, - use_grouped_topk, num_expert_group, topk_group) + return self.forward( + x, + layer.w13_weight, + layer.w2_weight, + router_logits, + top_k, + renormalize, + use_grouped_topk, + num_expert_group, + topk_group, + ) def forward_cuda( self, @@ -349,16 +121,19 @@ def forward_cuda( topk_group: Optional[int], ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe - return fused_moe(x, - w1, - w2, - router_logits, - top_k, - renormalize=renormalize, - inplace=True, - use_grouped_topk=use_grouped_topk, - num_expert_group=num_expert_group, - topk_group=topk_group) + + return fused_moe( + x, + w1, + w2, + router_logits, + top_k, + renormalize=renormalize, + inplace=True, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + ) def forward_cpu(self, *args, **kwargs): raise NotImplementedError( @@ -377,6 +152,7 @@ def forward_tpu( topk_group: Optional[int], ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe + assert not use_grouped_topk assert num_expert_group is None assert topk_group is None @@ -386,7 +162,7 @@ def forward_tpu( class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. - This layer contains both MergedColumnParallel weights (gate_up_proj / + This layer contains both MergedColumnParallel weights (gate_up_proj / w13) and RowParallelLinear weights (down_proj/ w2). Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We @@ -438,12 +214,9 @@ def __init__( self.num_expert_group = num_expert_group self.topk_group = topk_group - self.quant_method: Optional[QuantizeMethodBase] = None - if quant_config is None: - self.quant_method = UnquantizedFusedMoEMethod() - elif isinstance(quant_config, GPTQMarlinConfig): - self.quant_method = MarlinFusedMoEMethod(quant_config) + self.quant_method: Optional[ + QuantizeMethodBase] = UnquantizedFusedMoEMethod() else: self.quant_method = quant_config.get_quant_method(self, prefix) assert self.quant_method is not None @@ -454,15 +227,18 @@ def __init__( hidden_size=hidden_size, intermediate_size=self.intermediate_size_per_partition, params_dtype=params_dtype, - weight_loader=self.weight_loader) - - def weight_loader(self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: int, - expert_id: int, - is_quantized: bool = False): + weight_loader=self.weight_loader, + ) + + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: int, + expert_id: int, + is_quantized: bool = False, + ): param_data = param.data if is_quantized: @@ -491,8 +267,8 @@ def weight_loader(self, else: # Input scales can be loaded directly and should be equal. if "input_scale" in weight_name: - if param_data[expert_id] != 1 and (param_data[expert_id] - - loaded_weight).abs() > 1e-5: + if (param_data[expert_id] != 1 and + (param_data[expert_id] - loaded_weight).abs() > 1e-5): raise ValueError( "input_scales of w1 and w3 of a layer " f"must be equal. 
But got {param_data[expert_id]} " @@ -546,7 +322,8 @@ def forward(self, hidden_states: torch.Tensor, renormalize=self.renormalize, use_grouped_topk=self.use_grouped_topk, num_expert_group=self.num_expert_group, - topk_group=self.topk_group) + topk_group=self.topk_group, + ) if self.reduce_results and self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -556,37 +333,70 @@ def forward(self, hidden_states: torch.Tensor, @classmethod def make_expert_params_mapping( - cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int) -> List[Tuple[str, str, int, int]]: - + cls, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + ) -> List[Tuple[str, str, int, int]]: gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name] gate_down_up = [ ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name ] - return [ + return ([ # These are the weight scales for the experts # (param_name, weight_name, expert_id, shard_id) - ("experts.w13_scale" - if weight_name in gate_up else "experts.w2_scale", - f"experts.{expert_id}.{weight_name}.weight_scale", expert_id, - shard_id) for expert_id in range(num_experts) + ( + "experts.w13_scale" + if weight_name in gate_up else "experts.w2_scale", + f"experts.{expert_id}.{weight_name}.weight_scale", + expert_id, + shard_id, + ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) ] + [ # These are the weights for the experts # (param_name, weight_name, expert_id, shard_id) - ("experts.w13_weight" - if weight_name in gate_up else "experts.w2_weight", - f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id) - for expert_id in range(num_experts) + ( + "experts.w13_weight" + if weight_name in gate_up else "experts.w2_weight", + f"experts.{expert_id}.{weight_name}.weight", + expert_id, + shard_id, + ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) ] + [ # These are the weight scales for the experts # (param_name, weight_name, expert_id, shard_id) - ("experts.a13_scale" - if weight_name in gate_up else "experts.a2_scale", - f"experts.{expert_id}.{weight_name}.input_scale", expert_id, - shard_id) for expert_id in range(num_experts) + ( + "experts.a13_scale" + if weight_name in gate_up else "experts.a2_scale", + f"experts.{expert_id}.{weight_name}.input_scale", + expert_id, + shard_id, + ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) - ] + ] + [ + # These are the qweights for the experts + # (param_name, weight_name, expert_id, shard_id) + ( + "experts.w13_qweight" + if weight_name in gate_up else "experts.w2_qweight", + f"experts.{expert_id}.{weight_name}.qweight", + expert_id, + shard_id, + ) for expert_id in range(num_experts) + for shard_id, weight_name in enumerate(gate_down_up) + ] + [ + # These are the g_idx and g_idx_sort_indices scales for the experts + # (param_name, weight_name, expert_id, shard_id) + ( + "experts.w13_g_idx" + if weight_name in gate_up else "experts.w2_g_idx", + f"experts.{expert_id}.{weight_name}.g_idx", + expert_id, + shard_id, + ) for expert_id in range(num_experts) + for shard_id, weight_name in enumerate(gate_down_up) + ]) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index bdcc9c3b4f0c5..f58a89c8e4bb9 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py 
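# Illustrative sketch, not part of the patch: make_expert_params_mapping above
# returns (param_name, checkpoint_weight_name, expert_id, shard_id) tuples that
# the weight loader iterates over. The two-expert toy below mirrors only the
# qweight entries; all names here are assumptions for demonstration.
def _demo_expert_mapping(num_experts: int = 2):
    gate_up = ["w1", "w3"]
    gate_down_up = ["w1", "w2", "w3"]
    return [
        ("experts.w13_qweight" if w in gate_up else "experts.w2_qweight",
         f"experts.{e}.{w}.qweight", e, s)
        for e in range(num_experts)
        for s, w in enumerate(gate_down_up)
    ]

for entry in _demo_expert_mapping():
    print(entry)  # e.g. ('experts.w13_qweight', 'experts.0.w1.qweight', 0, 0)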
@@ -1,29 +1,53 @@ -from typing import Any, Dict, List, Optional - +from typing import Any, Dict, List, Optional, Union +import enum +from enum import Enum import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + set_weight_attrs, +) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase +from vllm.model_executor.layers.fused_moe.fused_moe_gptq import fused_moe_gptq +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - apply_gptq_marlin_linear, check_gptq_marlin_supported, marlin_is_k_full, - marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales, - marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor, - verify_gptq_marlin_supported, verify_marlin_supports_shape) + apply_gptq_marlin_linear, + check_gptq_marlin_supported, + marlin_is_k_full, + marlin_make_empty_g_idx, + marlin_make_workspace, + marlin_permute_scales, + marlin_repeat_scales_on_all_ranks, + marlin_sort_g_idx, + replace_tensor, + verify_gptq_marlin_supported, + verify_marlin_supports_shape, +) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead logger = init_logger(__name__) +class GPTQMarlinState(Enum): + REPACK = enum.auto() + READY = enum.auto() + + class GPTQMarlinConfig(QuantizationConfig): """Config class for GPTQ Marlin""" - def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool) -> None: + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) @@ -95,11 +119,14 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["GPTQMarlinLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: + if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) + and self.lm_head_quantized): return GPTQMarlinLinearMethod(self) + elif isinstance(layer, FusedMoE): + return GPTQMarlinMoEMethod(self) return None def get_scaled_act_names(self) -> List[str]: @@ -118,15 +145,15 @@ def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): return False # If we cannot find the info needed in the config, cannot convert. 
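# Illustrative sketch, not part of the patch: the GPTQMarlinState enum added
# above drives a one-shot conversion -- weights arrive in AutoGPTQ layout, the
# layer starts in REPACK, the first use repacks them into Marlin layout, and
# every later call sees READY and skips the work. The class below only mimics
# that control flow; its names are made up for the example.
import enum


class _State(enum.Enum):
    REPACK = enum.auto()
    READY = enum.auto()


class _LazyRepackLayer:
    def __init__(self) -> None:
        self.state = _State.REPACK

    def apply(self) -> None:
        if self.state == _State.REPACK:
            self.state = _State.READY
            # ... repack qweights/scales into Marlin format exactly once ...


_layer = _LazyRepackLayer()
_layer.apply()  # performs the repack
_layer.apply()  # already READY, nothing left to do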
- if (num_bits is None or group_size is None or sym is None - or desc_act is None): + if num_bits is None or group_size is None or sym is None or desc_act is None: return False return check_gptq_marlin_supported( num_bits=num_bits, group_size=group_size, is_sym=sym, - min_capability=cls.get_min_capability()) + min_capability=cls.get_min_capability(), + ) class GPTQMarlinLinearMethod(LinearMethodBase): @@ -163,7 +190,8 @@ def create_weights( output_size_per_partition=output_size_per_partition, input_size_per_partition=input_size_per_partition, input_size=input_size, - group_size=group_size) + group_size=group_size, + ) # Determine sharding if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act, @@ -293,7 +321,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, - num_bits=self.quant_config.weight_bits) + num_bits=self.quant_config.weight_bits, + ) replace_tensor(layer, "qweight", marlin_qweight) # Permute scales from autogptq format to marlin format. @@ -302,7 +331,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: size_k=(layer.input_size if self.quant_config.desc_act else layer.input_size_per_partition), size_n=layer.output_size_per_partition, - group_size=self.quant_config.group_size) + group_size=self.quant_config.group_size, + ) replace_tensor(layer, "scales", marlin_scales) def apply( @@ -323,4 +353,284 @@ def apply( output_size_per_partition=layer.output_size_per_partition, input_size_per_partition=layer.input_size_per_partition, is_k_full=layer.is_k_full, - bias=bias) + bias=bias, + ) + + +class GPTQMarlinMoEMethod(FusedMoEMethodBase): + """MoE Marlin method with quantization.""" + + def __init__(self, quant_config: GPTQMarlinConfig) -> None: + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # Currently assuming is_k_full is always True + # (input size per partition is the same as full input size) + # Supports only sym for now (no zp) + if self.quant_config.group_size != -1: + scales_size13 = hidden_size // self.quant_config.group_size + scales_size2 = intermediate_size // self.quant_config.group_size + else: + scales_size13 = 1 + scales_size2 = 1 + # Fused gate_up_proj (column parallel) + w13_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size // self.quant_config.pack_factor, + 2 * intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_qweight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size // self.quant_config.pack_factor, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + # up_proj scales + w13_scales = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + # down_proj scales + w2_scales = torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size, + dtype=params_dtype), + requires_grad=False, + ) + 
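# Illustrative shape arithmetic, not part of the patch, assuming 4-bit GPTQ
# (pack_factor = 32 // weight_bits = 8, i.e. eight 4-bit values per int32) and
# Mixtral-8x7B-like sizes. The qweight/scale parameters created above then
# come out as:
num_experts, hidden_size, intermediate_size = 8, 4096, 14336
pack_factor, group_size = 8, 128
w13_qweight_shape = (num_experts, hidden_size // pack_factor,
                     2 * intermediate_size)        # (8, 512, 28672)
w2_qweight_shape = (num_experts, intermediate_size // pack_factor,
                    hidden_size)                   # (8, 1792, 4096)
scales_size13 = hidden_size // group_size          # 32 groups along K
scales_size2 = intermediate_size // group_size     # 112 groups along K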
layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + w13_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx", w13_g_idx) + set_weight_attrs(w13_g_idx, extra_weight_attrs) + w2_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx", w2_g_idx) + set_weight_attrs(w2_g_idx, extra_weight_attrs) + w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) + w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) + layer.marlin_state = GPTQMarlinState.REPACK + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + ) -> torch.Tensor: + if layer.marlin_state == GPTQMarlinState.REPACK: + layer.marlin_state = GPTQMarlinState.READY + + # Newly generated tensors need to replace existing tensors that are + # already registered as parameters by vLLM (and won't be freed) + def replace_tensor(name, new_t): + # It is important to use resize_() here since it ensures + # the same buffer is reused + getattr(layer, name).resize_(new_t.shape) + getattr(layer, name).copy_(new_t) + del new_t + + def get_scale_perms(num_bits: int): + scale_perm: List[int] = [] + for i in range(8): + scale_perm.extend([i + 8 * j for j in range(8)]) + scale_perm_single: List[int] = [] + for i in range(4): + scale_perm_single.extend( + [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) + return scale_perm, scale_perm_single + + def marlin_permute_scales( + s: torch.Tensor, + size_k: int, + size_n: int, + group_size: int, + num_bits: int, + ): + scale_perm, scale_perm_single = get_scale_perms(num_bits) + if group_size < size_k and group_size != -1: + s = s.reshape((-1, len(scale_perm)))[:, scale_perm] + else: + s = s.reshape( + (-1, len(scale_perm_single)))[:, scale_perm_single] + s = s.reshape((-1, size_n)).contiguous() + return s + + def marlin_moe_permute_scales( + s: torch.Tensor, + size_k: int, + size_n: int, + group_size: int, + num_bits: int, + ): + num_experts = s.shape[0] + output = torch.empty( + (num_experts, s.shape[1], s.shape[2]), + device=s.device, + dtype=s.dtype, + ) + for e in range(num_experts): + output[e] = marlin_permute_scales(s[e], size_k, size_n, + group_size, num_bits) + return output + + # Process act_order + if self.quant_config.desc_act: + # Get sorting based on g_idx + num_experts = layer.w13_g_idx.shape[0] + w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) + w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_g_idx[e]).to(torch.int32) + w2_g_idx_sort_indices[e] = torch.argsort( + 
layer.w2_g_idx[e]).to(torch.int32) + w13_sorted_g_idx[e] = layer.w13_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_g_idx[e][ + w2_g_idx_sort_indices[e]] + replace_tensor("w13_g_idx", w13_sorted_g_idx) + replace_tensor("w2_g_idx", w2_sorted_g_idx) + replace_tensor("w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_tensor("w2_g_idx_sort_indices", w2_g_idx_sort_indices) + else: + # Reset g_idx related tensors + num_experts = layer.w13_g_idx.shape[0] + device = layer.w13_g_idx.device + layer.w13_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), + dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), + dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), + dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), + dtype=torch.int32, + device=device), + requires_grad=False, + ) + # Repack weights + marlin_w13_qweight = ops.gptq_marlin_moe_repack( + layer.w13_qweight, + layer.w13_g_idx_sort_indices, + layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + layer.w13_qweight.shape[2], + self.quant_config.weight_bits, + ) + replace_tensor("w13_qweight", marlin_w13_qweight) + marlin_w2_qweight = ops.gptq_marlin_moe_repack( + layer.w2_qweight, + layer.w2_g_idx_sort_indices, + layer.w2_qweight.shape[1] * self.quant_config.pack_factor, + layer.w2_qweight.shape[2], + self.quant_config.weight_bits, + ) + replace_tensor("w2_qweight", marlin_w2_qweight) + # Repack scales + marlin_w13_scales = marlin_moe_permute_scales( + layer.w13_scales, + x.shape[1], + layer.w13_scales.shape[2], + self.quant_config.group_size, + self.quant_config.weight_bits, + ) + replace_tensor("w13_scales", marlin_w13_scales) + marlin_w2_scales = marlin_moe_permute_scales( + layer.w2_scales, + layer.w2_scales.shape[1] * self.quant_config.pack_factor, + x.shape[1], + self.quant_config.group_size, + self.quant_config.weight_bits, + ) + replace_tensor("w2_scales", marlin_w2_scales) + return fused_moe_gptq( + x, + layer.w13_qweight, + layer.w2_qweight, + router_logits, + layer.w13_g_idx, + layer.w2_g_idx, + layer.w13_g_idx_sort_indices, + layer.w2_g_idx_sort_indices, + top_k, + renormalize=renormalize, + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + ) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 85dafd55bbcf8..cdfd24874b974 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -21,7 +21,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
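# Minimal runnable sketch, not part of the patch, of the resize_()/copy_()
# trick used by replace_tensor above: the repacked data is written into the
# tensor object that is already registered on the module, so the registration
# keeps pointing at the same parameter instead of a brand-new tensor. The
# layer and parameter names below are made up for the example.
import torch

layer = torch.nn.Module()
layer.w_demo = torch.nn.Parameter(torch.zeros(4, 4), requires_grad=False)

def _replace_tensor(name: str, new_t: torch.Tensor) -> None:
    getattr(layer, name).resize_(new_t.shape)  # reshape the registered tensor
    getattr(layer, name).copy_(new_t)          # fill it with the new values

_replace_tensor("w_demo", torch.ones(2, 4))
assert layer.w_demo.shape == (2, 4)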
"""Inference-only Mixtral model.""" -import re from typing import Iterable, List, Optional, Tuple import numpy as np @@ -35,7 +34,6 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, @@ -96,13 +94,10 @@ class MixtralMoE(nn.Module): def __init__( self, config: MixtralConfig, - use_fused_moe: bool, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.config = config - self.use_fused_moe = use_fused_moe - self.quant_config = quant_config self.rank = get_tensor_model_parallel_rank() self.tp_size = get_tensor_model_parallel_world_size() self.num_total_experts = config.num_local_experts @@ -118,26 +113,14 @@ def __init__( raise ValueError( f"Rank {self.rank} has no experts assigned to it.") - if self.use_fused_moe: - params_dtype = torch.float16 - self.experts = FusedMoE(num_experts=self.num_total_experts, - top_k=self.top_k, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - params_dtype=params_dtype, - reduce_results=True, - renormalize=True, - quant_config=quant_config, - tp_size=self.tp_size) - else: - self.experts = nn.ModuleList([ - MixtralMLP(self.num_total_experts, - config.hidden_size, - config.intermediate_size, - quant_config=quant_config) - if idx in self.expert_indicies else None - for idx in range(self.num_total_experts) - ]) + self.experts = nn.ModuleList([ + MixtralMLP(self.num_total_experts, + config.hidden_size, + config.intermediate_size, + quant_config=quant_config) + if idx in self.expert_indicies else None + for idx in range(self.num_total_experts) + ]) self.gate = ReplicatedLinear(config.hidden_size, self.num_total_experts, @@ -149,34 +132,28 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view(-1, hidden_dim) router_logits, _ = self.gate(hidden_states) - if self.use_fused_moe: - ret = self.experts(hidden_states.half(), router_logits) - return ret.bfloat16() - else: - routing_weights = F.softmax(router_logits, - dim=1, - dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = None - for expert_idx in self.expert_indicies: - expert_layer = self.experts[expert_idx] - expert_mask = (selected_experts == expert_idx) - expert_weights = (routing_weights * expert_mask).sum( - dim=-1, keepdim=True) - - current_hidden_states = expert_layer(hidden_states).mul_( - expert_weights) - if final_hidden_states is None: - final_hidden_states = current_hidden_states - else: - final_hidden_states.add_(current_hidden_states) - - return tensor_model_parallel_all_reduce(final_hidden_states).view( - num_tokens, hidden_dim) + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in self.expert_indicies: + expert_layer = self.experts[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum(dim=-1, + keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if 
final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + num_tokens, hidden_dim) class MixtralAttention(nn.Module): @@ -261,7 +238,6 @@ class MixtralDecoderLayer(nn.Module): def __init__( self, config: MixtralConfig, - use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -278,7 +254,6 @@ def __init__( cache_config=cache_config, quant_config=quant_config) self.block_sparse_moe = MixtralMoE(config=config, - use_fused_moe=use_fused_moe, quant_config=quant_config) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -319,7 +294,6 @@ class MixtralModel(nn.Module): def __init__( self, config: MixtralConfig, - use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -333,7 +307,6 @@ def __init__( ) self.layers = nn.ModuleList([ MixtralDecoderLayer(config, - use_fused_moe, cache_config, quant_config=quant_config) for _ in range(config.num_hidden_layers) @@ -370,12 +343,10 @@ def __init__( super().__init__() # TODO check runs with dtype=float16 - self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) self.config = config self.quant_config = quant_config - self.model = MixtralModel(config, self.use_fused_moe, cache_config, - quant_config) + self.model = MixtralModel(config, cache_config, quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -436,50 +407,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if name.endswith(".bias") and name not in params_dict: continue - if self.use_fused_moe: - if ("block_sparse_moe.experts." in name - and ".w1." not in name and ".w2." not in name - and ".w3." not in name - and name not in params_dict): - continue - - if (".qzeros" in name): - continue - - shard_id = None - expert_id = 0 - - has_any_numbered = (".qweight" in name or ".scales" in name - or ".g_idx" in name) - if (has_any_numbered and (".w1." in name)): - name = name.replace(".w1.", ".w13_") - shard_id = 0 - if (has_any_numbered and (".w2." in name)): - name = name.replace(".w2.", ".w2_") - shard_id = 0 - if (has_any_numbered and (".w3." in name)): - name = name.replace(".w3.", ".w13_") - shard_id = 1 - - exp_string = re.search(r"\.experts\.\d+.", name) - if exp_string: - exp_string = exp_string.group(0) - expert_id = int(exp_string.split(".")[2]) - name = name.replace(exp_string, ".experts.") - - else: - if ("block_sparse_moe.experts." in name - and name not in params_dict): - continue - - param = params_dict[name] - - if self.use_fused_moe and shard_id is not None: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight, name, shard_id, - expert_id, True) - else: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + if ("block_sparse_moe.experts." 
in name + and name not in params_dict): + continue + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From 7da678eb61bcd50c0b51d30d899e319d9e255d5a Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 6 Aug 2024 18:18:26 -0700 Subject: [PATCH 005/106] Fixing tests --- tests/kernels/test_moe.py | 86 ++++++++++--------- .../layers/fused_moe/fused_moe_gptq.py | 26 ++++-- 2 files changed, 63 insertions(+), 49 deletions(-) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index e73e5a518ef1a..d9480c8cf882e 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -10,10 +10,10 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import (fused_marlin_moe, fused_moe, - single_marlin_moe) +from vllm.model_executor.layers.fused_moe import fused_moe, single_marlin_moe +from vllm.model_executor.layers.fused_moe.fused_moe_gptq import fused_moe_gptq from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - marlin_quantize) + marlin_quantize, ) from vllm.model_executor.models.mixtral import MixtralMoE @@ -62,11 +62,11 @@ def test_fused_moe( topk: int, dtype: torch.dtype, ): - a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10 - w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 - score = torch.randn((m, e), device='cuda', dtype=dtype) + score = torch.randn((m, e), device="cuda", dtype=dtype) triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) torch_output = torch_moe(a, w1, w2, score, topk) assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) @@ -114,10 +114,12 @@ def test_mixtral_moe(dtype: torch.dtype): torch.bfloat16: 1e-2, } - assert torch.allclose(hf_states.flatten(0, 1), - vllm_states, - rtol=mixtral_moe_tol[dtype], - atol=mixtral_moe_tol[dtype]) + assert torch.allclose( + hf_states.flatten(0, 1), + vllm_states, + rtol=mixtral_moe_tol[dtype], + atol=mixtral_moe_tol[dtype], + ) def stack_and_dev(tensors: List[torch.Tensor]): @@ -165,11 +167,11 @@ def test_fused_marlin_moe( num_bits = 4 dtype = torch.float16 - a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10 - w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 for i in range(w2.shape[0]): - w2[0] = torch.eye(k, n, device='cuda', dtype=dtype) + w2[0] = torch.eye(k, n, device="cuda", dtype=dtype) w_ref1_l = [] qweight1_l = [] @@ -215,27 +217,31 @@ def test_fused_marlin_moe( g_idx2 = stack_and_dev(g_idx2_l) sort_indices2 = stack_and_dev(sort_indices2_l) - score = torch.randn((m, e), device='cuda', dtype=dtype) - triton_output = fused_moe(a, - w_ref1.transpose(1, 2).contiguous(), - w_ref2.transpose(1, 2).contiguous(), - score, - topk, - renormalize=False) - marlin_output = fused_marlin_moe(a, - qweight1, - qweight2, - score, - g_idx1, - g_idx2, - sort_indices1, - sort_indices2, - topk, - renormalize=False, - w1_scale=scales1, - 
w2_scale=scales2) - - assert (compute_max_diff(marlin_output, triton_output) < 4e-2) + score = torch.randn((m, e), device="cuda", dtype=dtype) + triton_output = fused_moe( + a, + w_ref1.transpose(1, 2).contiguous(), + w_ref2.transpose(1, 2).contiguous(), + score, + topk, + renormalize=False, + ) + marlin_output = fused_moe_gptq( + a, + qweight1, + qweight2, + score, + g_idx1, + g_idx2, + sort_indices1, + sort_indices2, + topk, + renormalize=False, + w1_scale=scales1, + w2_scale=scales2, + ) + + assert compute_max_diff(marlin_output, triton_output) < 4e-2 # TODO: make sure this test works @@ -272,8 +278,8 @@ def test_single_marlin_moe( num_bits = 4 dtype = torch.float16 - a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 - w = torch.randn((e, n, k), device='cuda', dtype=dtype) / 10 + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10 w_ref_l = [] qweights_l = [] @@ -297,7 +303,7 @@ def test_single_marlin_moe( g_idx = stack_and_dev(g_idx_l) sort_indices = stack_and_dev(sort_indices_l) - score = torch.randn((m, e), device='cuda', dtype=dtype) + score = torch.randn((m, e), device="cuda", dtype=dtype) marlin_output = single_marlin_moe(a, qweight, scales, @@ -308,4 +314,4 @@ def test_single_marlin_moe( renormalize=False) torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) - assert (compute_max_diff(marlin_output, torch_output) < 1e-2) + assert compute_max_diff(marlin_output, torch_output) < 1e-2 diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_gptq.py b/vllm/model_executor/layers/fused_moe/fused_moe_gptq.py index 15c11fc0b668e..e7c47f14f85d4 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_gptq.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_gptq.py @@ -51,19 +51,25 @@ def fused_moe_gptq( - torch.Tensor: The output tensor after applying the MoE layer. """ # Check constraints. 
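# Illustrative numbers, not part of the patch, assuming 4-bit Marlin packing
# and Mixtral-8x7B-like sizes (K = hidden size, N = intermediate size,
# E = experts). Marlin-repacked qweights tile K in groups of 16 rows and pack
# eight 4-bit values per int32, which is what the asserts that follow unpack:
E, K, N = 8, 4096, 14336
w1_shape = (E, K // 16, (2 * N) * 16 // 8)   # (8, 256, 57344)
w2_shape = (E, N // 16, K * 16 // 8)         # (8, 896, 8192)
assert K == w1_shape[1] * 16                 # "Hidden size mismatch w1"
assert K == w2_shape[2] // 2                 # "Hidden size mismatch w2" (4-bit)
assert N == w2_shape[1] * 16                 # recovered intermediate size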
- assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" - assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1" - assert hidden_states.shape[1] == w2.shape[2] // 2, "Hidden size mismatch w2" + assert hidden_states.shape[0] == gating_output.shape[ + 0], "Number of tokens mismatch" + assert hidden_states.shape[ + 1] == w1.shape[1] * 16, "Hidden size mismatch w1" + assert hidden_states.shape[ + 1] == w2.shape[2] // 2, "Hidden size mismatch w2" assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" assert w2.is_contiguous(), "Expert weights2 must be contiguous" - assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16] + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] M, K = hidden_states.shape E = w1.shape[0] N = w2.shape[1] * 16 - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, renormalize) + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, + renormalize) get_config_func = functools.partial( try_get_optimal_moe_config, @@ -81,9 +87,10 @@ def fused_moe_gptq( sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 - workspace = torch.zeros( - max_workspace_size, dtype=torch.int, device="cuda", requires_grad=False - ) + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + device="cuda", + requires_grad=False) intermediate_cache2 = torch.empty( (M * topk_ids.shape[1], N), @@ -135,4 +142,5 @@ def fused_moe_gptq( True, ) - return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), + dim=1) From 641696b8608843e12c5852dd33c7c6322ba5d297 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 8 Aug 2024 10:16:19 -0700 Subject: [PATCH 006/106] Addressing repacking comment --- .../layers/quantization/gptq_marlin.py | 221 +++++++----------- .../layers/quantization/utils/marlin_utils.py | 17 ++ 2 files changed, 98 insertions(+), 140 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index f58a89c8e4bb9..90ffbd4360f9b 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -21,6 +21,7 @@ marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales, + marlin_moe_permute_scales, marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor, @@ -469,6 +470,86 @@ def create_weights( set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) layer.marlin_state = GPTQMarlinState.REPACK + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.marlin_state = GPTQMarlinState.READY + + # Process act_order + if self.quant_config.desc_act: + # Get sorting based on g_idx + num_experts = layer.w13_g_idx.shape[0] + w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) + w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_g_idx[e]).to(torch.int32) + w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to( + 
torch.int32) + w13_sorted_g_idx[e] = layer.w13_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_g_idx[e][ + w2_g_idx_sort_indices[e]] + replace_tensor(layer, "w13_g_idx", w13_sorted_g_idx) + replace_tensor(layer, "w2_g_idx", w2_sorted_g_idx) + replace_tensor(layer, "w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_tensor(layer, "w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + else: + # Reset g_idx related tensors + num_experts = layer.w13_g_idx.shape[0] + device = layer.w13_g_idx.device + layer.w13_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + # Repack weights + marlin_w13_qweight = ops.gptq_marlin_moe_repack( + layer.w13_qweight, + layer.w13_g_idx_sort_indices, + layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + layer.w13_qweight.shape[2], + self.quant_config.weight_bits, + ) + replace_tensor(layer, "w13_qweight", marlin_w13_qweight) + marlin_w2_qweight = ops.gptq_marlin_moe_repack( + layer.w2_qweight, layer.w2_g_idx_sort_indices, + layer.w2_qweight.shape[1] * self.quant_config.pack_factor, + layer.w2_qweight.shape[2]) + replace_tensor(layer, "w2_qweight", marlin_w2_qweight) + # Repack scales + marlin_w13_scales = marlin_moe_permute_scales( + s=layer.w13_scales, + size_k=(layer.input_size if self.quant_config.desc_act else + layer.input_size_per_partition), + size_n=layer.w13_scales.shape[2], + group_size=self.quant_config.group_size) + replace_tensor(layer, "w13_scales", marlin_w13_scales) + marlin_w2_scales = marlin_moe_permute_scales( + s=layer.w2_scales, + size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, + size_n=(layer.input_size if self.quant_config.desc_act else + layer.input_size_per_partition), + group_size=self.quant_config.group_size) + replace_tensor(layer, "w2_scales", marlin_w2_scales) + def apply( self, layer: torch.nn.Module, @@ -480,146 +561,6 @@ def apply( num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, ) -> torch.Tensor: - if layer.marlin_state == GPTQMarlinState.REPACK: - layer.marlin_state = GPTQMarlinState.READY - - # Newly generated tensors need to replace existing tensors that are - # already registered as parameters by vLLM (and won't be freed) - def replace_tensor(name, new_t): - # It is important to use resize_() here since it ensures - # the same buffer is reused - getattr(layer, name).resize_(new_t.shape) - getattr(layer, name).copy_(new_t) - del new_t - - def get_scale_perms(num_bits: int): - scale_perm: List[int] = [] - for i in range(8): - scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: List[int] = [] - for i in range(4): - scale_perm_single.extend( - [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) - return scale_perm, scale_perm_single - - def marlin_permute_scales( - s: torch.Tensor, - size_k: int, - size_n: int, - group_size: int, - num_bits: int, - ): - scale_perm, scale_perm_single = get_scale_perms(num_bits) - if group_size < size_k and group_size != -1: - s = s.reshape((-1, len(scale_perm)))[:, scale_perm] - else: - s = s.reshape( - 
(-1, len(scale_perm_single)))[:, scale_perm_single] - s = s.reshape((-1, size_n)).contiguous() - return s - - def marlin_moe_permute_scales( - s: torch.Tensor, - size_k: int, - size_n: int, - group_size: int, - num_bits: int, - ): - num_experts = s.shape[0] - output = torch.empty( - (num_experts, s.shape[1], s.shape[2]), - device=s.device, - dtype=s.dtype, - ) - for e in range(num_experts): - output[e] = marlin_permute_scales(s[e], size_k, size_n, - group_size, num_bits) - return output - - # Process act_order - if self.quant_config.desc_act: - # Get sorting based on g_idx - num_experts = layer.w13_g_idx.shape[0] - w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) - w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) - w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) - w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) - for e in range(num_experts): - w13_g_idx_sort_indices[e] = torch.argsort( - layer.w13_g_idx[e]).to(torch.int32) - w2_g_idx_sort_indices[e] = torch.argsort( - layer.w2_g_idx[e]).to(torch.int32) - w13_sorted_g_idx[e] = layer.w13_g_idx[e][ - w13_g_idx_sort_indices[e]] - w2_sorted_g_idx[e] = layer.w2_g_idx[e][ - w2_g_idx_sort_indices[e]] - replace_tensor("w13_g_idx", w13_sorted_g_idx) - replace_tensor("w2_g_idx", w2_sorted_g_idx) - replace_tensor("w13_g_idx_sort_indices", - w13_g_idx_sort_indices) - replace_tensor("w2_g_idx_sort_indices", w2_g_idx_sort_indices) - else: - # Reset g_idx related tensors - num_experts = layer.w13_g_idx.shape[0] - device = layer.w13_g_idx.device - layer.w13_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), - dtype=torch.int32, - device=device), - requires_grad=False, - ) - layer.w2_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), - dtype=torch.int32, - device=device), - requires_grad=False, - ) - layer.w13_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), - dtype=torch.int32, - device=device), - requires_grad=False, - ) - layer.w2_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), - dtype=torch.int32, - device=device), - requires_grad=False, - ) - # Repack weights - marlin_w13_qweight = ops.gptq_marlin_moe_repack( - layer.w13_qweight, - layer.w13_g_idx_sort_indices, - layer.w13_qweight.shape[1] * self.quant_config.pack_factor, - layer.w13_qweight.shape[2], - self.quant_config.weight_bits, - ) - replace_tensor("w13_qweight", marlin_w13_qweight) - marlin_w2_qweight = ops.gptq_marlin_moe_repack( - layer.w2_qweight, - layer.w2_g_idx_sort_indices, - layer.w2_qweight.shape[1] * self.quant_config.pack_factor, - layer.w2_qweight.shape[2], - self.quant_config.weight_bits, - ) - replace_tensor("w2_qweight", marlin_w2_qweight) - # Repack scales - marlin_w13_scales = marlin_moe_permute_scales( - layer.w13_scales, - x.shape[1], - layer.w13_scales.shape[2], - self.quant_config.group_size, - self.quant_config.weight_bits, - ) - replace_tensor("w13_scales", marlin_w13_scales) - marlin_w2_scales = marlin_moe_permute_scales( - layer.w2_scales, - layer.w2_scales.shape[1] * self.quant_config.pack_factor, - x.shape[1], - self.quant_config.group_size, - self.quant_config.weight_bits, - ) - replace_tensor("w2_scales", marlin_w2_scales) return fused_moe_gptq( x, layer.w13_qweight, diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index b789ca20cadb3..610650a744986 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ 
b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -181,6 +181,23 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int, return s +def marlin_moe_permute_scales( + s: torch.Tensor, + size_k: int, + size_n: int, + group_size: int, +): + num_experts = s.shape[0] + output = torch.empty( + (num_experts, s.shape[1], s.shape[2]), + device=s.device, + dtype=s.dtype, + ) + for e in range(num_experts): + output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size) + return output + + def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: # Permute zero-points in a similar way to scales, but do not use the From 3cef6678e7d0ee54d05cd95b9e91b8f691bed8a8 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 8 Aug 2024 10:20:13 -0700 Subject: [PATCH 007/106] gptq -> marlin renaming --- tests/kernels/test_moe.py | 46 +++--- .../layers/fused_moe/__init__.py | 4 +- ...{fused_moe_gptq.py => fused_moe_marlin.py} | 28 ++-- .../layers/quantization/gptq_marlin.py | 155 +++++++++--------- 4 files changed, 113 insertions(+), 120 deletions(-) rename vllm/model_executor/layers/fused_moe/{fused_moe_gptq.py => fused_moe_marlin.py} (84%) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index d9480c8cf882e..856ee7c56e598 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -11,9 +11,10 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe, single_marlin_moe -from vllm.model_executor.layers.fused_moe.fused_moe_gptq import fused_moe_gptq +from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - marlin_quantize, ) + marlin_quantize, +) from vllm.model_executor.models.mixtral import MixtralMoE @@ -28,10 +29,12 @@ def torch_moe(a, w1, w2, score, topk): for i in range(w1.shape[0]): mask = topk_ids == i if mask.sum(): - out[mask] = SiluAndMul()( - a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) - return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + out[mask] = SiluAndMul()(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose( + 0, 1 + ) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) def torch_moe_single(a, w, score, topk): @@ -72,8 +75,7 @@ def test_fused_moe( assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) -@pytest.mark.parametrize("dtype", - [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() def test_mixtral_moe(dtype: torch.dtype): """Make sure our Mixtral MoE implementation agrees with the one from @@ -94,8 +96,7 @@ def test_mixtral_moe(dtype: torch.dtype): # Load the weights vllm_moe.gate.weight.data[:] = hf_moe.gate.weight.data for i in range(config.num_local_experts): - weights = (hf_moe.experts[i].w1.weight.data, - hf_moe.experts[i].w3.weight.data) + weights = (hf_moe.experts[i].w1.weight.data, hf_moe.experts[i].w3.weight.data) vllm_moe.experts.w13_weight[i][:] = torch.cat(weights, dim=0) vllm_moe.experts.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data @@ -129,7 +130,8 @@ def stack_and_dev(tensors: List[torch.Tensor]): def compute_max_diff(output, output_ref): return torch.mean(torch.abs(output - output_ref)) / torch.mean( - torch.abs(output_ref)) + torch.abs(output_ref) + ) # TODO: make sure 
this test works @@ -182,7 +184,8 @@ def test_fused_marlin_moe( for i in range(w1.shape[0]): test_perm = torch.randperm(k) w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize( - w1[i].transpose(1, 0), num_bits, group_size, act_order, test_perm) + w1[i].transpose(1, 0), num_bits, group_size, act_order, test_perm + ) w_ref1_l.append(w_ref1) qweight1_l.append(qweight1) scales1_l.append(scales1) @@ -204,7 +207,8 @@ def test_fused_marlin_moe( for i in range(w2.shape[0]): test_perm = torch.randperm(n) w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize( - w2[i].transpose(1, 0), num_bits, group_size, act_order, test_perm) + w2[i].transpose(1, 0), num_bits, group_size, act_order, test_perm + ) w_ref2_l.append(w_ref2) qweight2_l.append(qweight2) scales2_l.append(scales2) @@ -226,7 +230,7 @@ def test_fused_marlin_moe( topk, renormalize=False, ) - marlin_output = fused_moe_gptq( + marlin_output = fused_moe_marlin( a, qweight1, qweight2, @@ -290,7 +294,8 @@ def test_single_marlin_moe( for i in range(w.shape[0]): test_perm = torch.randperm(k) w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize( - w[i].transpose(1, 0), num_bits, group_size, act_order, test_perm) + w[i].transpose(1, 0), num_bits, group_size, act_order, test_perm + ) w_ref_l.append(w_ref) qweights_l.append(qweight) scales_l.append(scales) @@ -304,14 +309,9 @@ def test_single_marlin_moe( sort_indices = stack_and_dev(sort_indices_l) score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe(a, - qweight, - scales, - score, - g_idx, - sort_indices, - topk, - renormalize=False) + marlin_output = single_marlin_moe( + a, qweight, scales, score, g_idx, sort_indices, topk, renormalize=False + ) torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) assert compute_max_diff(marlin_output, torch_output) < 1e-2 diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 2b982b7ab9f86..beb94f10a557e 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,4 +1,4 @@ -from vllm.model_executor.layers.fused_moe.fused_moe_gptq import fused_moe_gptq +from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin from vllm.model_executor.layers.fused_moe.fused_moe import single_marlin_moe from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase from vllm.triton_utils import HAS_TRITON @@ -6,7 +6,7 @@ __all__ = [ "FusedMoE", "FusedMoEMethodBase", - "fused_moe_gptq", + "fused_moe_marlin", "single_marlin_moe", ] diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_gptq.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py similarity index 84% rename from vllm/model_executor/layers/fused_moe/fused_moe_gptq.py rename to vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index e7c47f14f85d4..4ffcda6f85d5e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_gptq.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -8,7 +8,7 @@ from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -def fused_moe_gptq( +def fused_moe_marlin( hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -51,25 +51,19 @@ def fused_moe_gptq( - torch.Tensor: The output tensor after applying the MoE layer. """ # Check constraints. 
- assert hidden_states.shape[0] == gating_output.shape[ - 0], "Number of tokens mismatch" - assert hidden_states.shape[ - 1] == w1.shape[1] * 16, "Hidden size mismatch w1" - assert hidden_states.shape[ - 1] == w2.shape[2] // 2, "Hidden size mismatch w2" + assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" + assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1" + assert hidden_states.shape[1] == w2.shape[2] // 2, "Hidden size mismatch w2" assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" assert w2.is_contiguous(), "Expert weights2 must be contiguous" - assert hidden_states.dtype in [ - torch.float32, torch.float16, torch.bfloat16 - ] + assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16] M, K = hidden_states.shape E = w1.shape[0] N = w2.shape[1] * 16 - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize) + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, renormalize) get_config_func = functools.partial( try_get_optimal_moe_config, @@ -87,10 +81,9 @@ def fused_moe_gptq( sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 - workspace = torch.zeros(max_workspace_size, - dtype=torch.int, - device="cuda", - requires_grad=False) + workspace = torch.zeros( + max_workspace_size, dtype=torch.int, device="cuda", requires_grad=False + ) intermediate_cache2 = torch.empty( (M * topk_ids.shape[1], N), @@ -142,5 +135,4 @@ def fused_moe_gptq( True, ) - return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), - dim=1) + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 90ffbd4360f9b..c81517823cbf2 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -12,7 +12,7 @@ set_weight_attrs, ) from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase -from vllm.model_executor.layers.fused_moe.fused_moe_gptq import fused_moe_gptq +from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.quantization.utils.marlin_utils import ( apply_gptq_marlin_linear, @@ -62,15 +62,17 @@ def __init__( self.lm_head_quantized = lm_head_quantized # Verify supported on platform. 
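# Minimal sketch, not part of the patch, of the final reduction returned by
# fused_moe_marlin above: intermediate_cache3 holds `topk` expert outputs of
# width K per token (already scaled by their routing weights by this point),
# and summing over dim=1 collapses them into one hidden state per token.
# Sizes below are toy values chosen for illustration only.
import torch

M, topk, K = 3, 2, 8
intermediate_cache3 = torch.randn(M, topk, K)
out = torch.sum(intermediate_cache3.view(M, topk, K), dim=1)
assert out.shape == (M, K)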
- verify_gptq_marlin_supported(num_bits=self.weight_bits, - group_size=self.group_size, - is_sym=self.is_sym) + verify_gptq_marlin_supported( + num_bits=self.weight_bits, group_size=self.group_size, is_sym=self.is_sym + ) def __repr__(self) -> str: - return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " - f"group_size={self.group_size}, " - f"desc_act={self.desc_act}, " - f"lm_head_quantized={self.lm_head_quantized})") + return ( + f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"desc_act={self.desc_act}, " + f"lm_head_quantized={self.lm_head_quantized})" + ) @classmethod def get_name(cls) -> str: @@ -94,37 +96,40 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) is_sym = cls.get_from_keys(config, ["sym"]) - lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], - default=False) - return cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + return cls(weight_bits, group_size, desc_act, is_sym, lm_head_quantized) @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg) - is_valid_user_quant = (user_quant is None or user_quant == "marlin" - or user_quant == "gptq_marlin") + is_valid_user_quant = ( + user_quant is None or user_quant == "marlin" or user_quant == "gptq_marlin" + ) if can_convert and is_valid_user_quant: - msg = ("The model is convertible to {} during runtime." - " Using {} kernel.".format(cls.get_name(), cls.get_name())) + msg = ( + "The model is convertible to {} during runtime." + " Using {} kernel.".format(cls.get_name(), cls.get_name()) + ) logger.info(msg) return cls.get_name() if can_convert and user_quant == "gptq": - logger.info("Detected that the model can run with gptq_marlin" - ", however you specified quantization=gptq explicitly," - " so forcing gptq. Use quantization=gptq_marlin for" - " faster inference") + logger.info( + "Detected that the model can run with gptq_marlin" + ", however you specified quantization=gptq explicitly," + " so forcing gptq. Use quantization=gptq_marlin for" + " faster inference" + ) return None def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: - if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) - and self.lm_head_quantized): + if isinstance(layer, LinearBase) or ( + isinstance(layer, ParallelLMHead) and self.lm_head_quantized + ): return GPTQMarlinLinearMethod(self) elif isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) @@ -195,9 +200,9 @@ def create_weights( ) # Determine sharding - if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act, - self.quant_config.group_size, - is_row_parallel): + if marlin_repeat_scales_on_all_ranks( + self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel + ): # By setting scale_dim == None, weight_loader will # repeat the scales on each GPU in TP>1 case. scales_and_zp_input_dim = None @@ -239,10 +244,7 @@ def create_weights( # Ignore warning from fused linear layers such as QKVParallelLinear. 
set_weight_attrs( g_idx, - { - **extra_weight_attrs, "input_dim": 0, - "ignore_warning": True - }, + {**extra_weight_attrs, "input_dim": 0, "ignore_warning": True}, ) # Scales @@ -291,8 +293,7 @@ def create_weights( layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition layer.input_size = input_size - layer.is_k_full = marlin_is_k_full(self.quant_config.desc_act, - is_row_parallel) + layer.is_k_full = marlin_is_k_full(self.quant_config.desc_act, is_row_parallel) # Checkpoints are serialized in AutoGPTQ format, which is different from the # marlin format. This function is called after the weights are loaded. @@ -301,8 +302,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: device = layer.qweight.device # Allocate marlin workspace - layer.workspace = marlin_make_workspace( - layer.output_size_per_partition, device) + layer.workspace = marlin_make_workspace(layer.output_size_per_partition, device) # Handle sorting for activation reordering if needed. if self.quant_config.desc_act: @@ -329,8 +329,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Permute scales from autogptq format to marlin format. marlin_scales = marlin_permute_scales( layer.scales, - size_k=(layer.input_size if self.quant_config.desc_act else - layer.input_size_per_partition), + size_k=( + layer.input_size + if self.quant_config.desc_act + else layer.input_size_per_partition + ), size_n=layer.output_size_per_partition, group_size=self.quant_config.group_size, ) @@ -408,20 +411,16 @@ def create_weights( set_weight_attrs(w2_qweight, extra_weight_attrs) # up_proj scales w13_scales = torch.nn.Parameter( - torch.empty(num_experts, - scales_size13, - 2 * intermediate_size, - dtype=params_dtype), + torch.empty( + num_experts, scales_size13, 2 * intermediate_size, dtype=params_dtype + ), requires_grad=False, ) layer.register_parameter("w13_scales", w13_scales) set_weight_attrs(w13_scales, extra_weight_attrs) # down_proj scales w2_scales = torch.nn.Parameter( - torch.empty(num_experts, - scales_size2, - hidden_size, - dtype=params_dtype), + torch.empty(num_experts, scales_size2, hidden_size, dtype=params_dtype), requires_grad=False, ) layer.register_parameter("w2_scales", w2_scales) @@ -454,8 +453,7 @@ def create_weights( ), requires_grad=False, ) - layer.register_parameter("w13_g_idx_sort_indices", - w13_g_idx_sort_indices) + layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices) set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) w2_g_idx_sort_indices = torch.nn.Parameter( torch.empty( @@ -465,8 +463,7 @@ def create_weights( ), requires_grad=False, ) - layer.register_parameter("w2_g_idx_sort_indices", - w2_g_idx_sort_indices) + layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices) set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) layer.marlin_state = GPTQMarlinState.REPACK @@ -482,42 +479,36 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) for e in range(num_experts): - w13_g_idx_sort_indices[e] = torch.argsort( - layer.w13_g_idx[e]).to(torch.int32) + w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_g_idx[e]).to( + torch.int32 + ) w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to( - torch.int32) - w13_sorted_g_idx[e] = layer.w13_g_idx[e][ - w13_g_idx_sort_indices[e]] - w2_sorted_g_idx[e] = layer.w2_g_idx[e][ - 
w2_g_idx_sort_indices[e]] + torch.int32 + ) + w13_sorted_g_idx[e] = layer.w13_g_idx[e][w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_g_idx[e][w2_g_idx_sort_indices[e]] replace_tensor(layer, "w13_g_idx", w13_sorted_g_idx) replace_tensor(layer, "w2_g_idx", w2_sorted_g_idx) - replace_tensor(layer, "w13_g_idx_sort_indices", - w13_g_idx_sort_indices) - replace_tensor(layer, "w2_g_idx_sort_indices", - w2_g_idx_sort_indices) + replace_tensor(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices) + replace_tensor(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices) else: # Reset g_idx related tensors num_experts = layer.w13_g_idx.shape[0] device = layer.w13_g_idx.device layer.w13_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, - device=device), + torch.empty((num_experts, 0), dtype=torch.int32, device=device), requires_grad=False, ) layer.w2_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, - device=device), + torch.empty((num_experts, 0), dtype=torch.int32, device=device), requires_grad=False, ) layer.w13_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, - device=device), + torch.empty((num_experts, 0), dtype=torch.int32, device=device), requires_grad=False, ) layer.w2_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, - device=device), + torch.empty((num_experts, 0), dtype=torch.int32, device=device), requires_grad=False, ) # Repack weights @@ -530,24 +521,34 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ) replace_tensor(layer, "w13_qweight", marlin_w13_qweight) marlin_w2_qweight = ops.gptq_marlin_moe_repack( - layer.w2_qweight, layer.w2_g_idx_sort_indices, + layer.w2_qweight, + layer.w2_g_idx_sort_indices, layer.w2_qweight.shape[1] * self.quant_config.pack_factor, - layer.w2_qweight.shape[2]) + layer.w2_qweight.shape[2], + ) replace_tensor(layer, "w2_qweight", marlin_w2_qweight) # Repack scales marlin_w13_scales = marlin_moe_permute_scales( s=layer.w13_scales, - size_k=(layer.input_size if self.quant_config.desc_act else - layer.input_size_per_partition), + size_k=( + layer.input_size + if self.quant_config.desc_act + else layer.input_size_per_partition + ), size_n=layer.w13_scales.shape[2], - group_size=self.quant_config.group_size) + group_size=self.quant_config.group_size, + ) replace_tensor(layer, "w13_scales", marlin_w13_scales) marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, - size_n=(layer.input_size if self.quant_config.desc_act else - layer.input_size_per_partition), - group_size=self.quant_config.group_size) + size_n=( + layer.input_size + if self.quant_config.desc_act + else layer.input_size_per_partition + ), + group_size=self.quant_config.group_size, + ) replace_tensor(layer, "w2_scales", marlin_w2_scales) def apply( @@ -561,7 +562,7 @@ def apply( num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, ) -> torch.Tensor: - return fused_moe_gptq( + return fused_moe_marlin( x, layer.w13_qweight, layer.w2_qweight, From a6710af0ab3bd8e1bf030d8a4f1ff14eb9afab37 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 8 Aug 2024 10:46:20 -0700 Subject: [PATCH 008/106] Undo formatting changes --- vllm/model_executor/layers/fused_moe/layer.py | 31 ++++++------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py 
b/vllm/model_executor/layers/fused_moe/layer.py index 913d6a93b0cd5..566a0a4f23be6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -54,15 +54,9 @@ def apply( class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" - def create_weights( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size: int, + params_dtype: torch.dtype, **extra_weight_attrs): # Fused gate_up_proj (column parallel) w13_weight = torch.nn.Parameter( torch.empty(num_experts, @@ -158,11 +152,10 @@ def forward_tpu( assert topk_group is None return fused_moe(x, w1, w2, router_logits, top_k, renormalize) - class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. - This layer contains both MergedColumnParallel weights (gate_up_proj / + This layer contains both MergedColumnParallel weights (gate_up_proj / w13) and RowParallelLinear weights (down_proj/ w2). Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We @@ -267,8 +260,8 @@ def weight_loader( else: # Input scales can be loaded directly and should be equal. if "input_scale" in weight_name: - if (param_data[expert_id] != 1 and - (param_data[expert_id] - loaded_weight).abs() > 1e-5): + if (param_data[expert_id] != 1 and (param_data[expert_id] - + loaded_weight).abs() > 1e-5): raise ValueError( "input_scales of w1 and w3 of a layer " f"must be equal. But got {param_data[expert_id]} " @@ -322,8 +315,7 @@ def forward(self, hidden_states: torch.Tensor, renormalize=self.renormalize, use_grouped_topk=self.use_grouped_topk, num_expert_group=self.num_expert_group, - topk_group=self.topk_group, - ) + topk_group=self.topk_group) if self.reduce_results and self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -333,12 +325,9 @@ def forward(self, hidden_states: torch.Tensor, @classmethod def make_expert_params_mapping( - cls, - ckpt_gate_proj_name: str, - ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int, - ) -> List[Tuple[str, str, int, int]]: + cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int) -> List[Tuple[str, str, int, str]]: gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name] gate_down_up = [ ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name From e29107f1f13c73d57f02bcee7ac8a433536c831d Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 8 Aug 2024 10:47:48 -0700 Subject: [PATCH 009/106] Final formatting change --- vllm/model_executor/layers/fused_moe/layer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 566a0a4f23be6..25c6214318692 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -146,7 +146,6 @@ def forward_tpu( topk_group: Optional[int], ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe - assert not use_grouped_topk assert num_expert_group is None assert topk_group is None From 099d61e73f5ee36e3ebf1f2c753970efecbba39b Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 13:50:02 -0700 Subject: [PATCH 010/106] Switching to mixtral file for quantized mixtral --- vllm/model_executor/models/__init__.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 94c3cea98be7b..329df4830af41 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -48,7 +48,7 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), - "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), + "QuantMixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), From bdf6bdc31d9ba050b207589315fb0f3389da389b Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 14:42:45 -0700 Subject: [PATCH 011/106] Bug fixes --- tests/kernels/test_moe.py | 42 +++--- .../layers/fused_moe/fused_moe_marlin.py | 26 ++-- vllm/model_executor/layers/fused_moe/layer.py | 11 +- .../layers/quantization/gptq_marlin.py | 139 +++++++++--------- vllm/model_executor/models/mixtral.py | 10 +- 5 files changed, 122 insertions(+), 106 deletions(-) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 856ee7c56e598..e657581df05a0 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -13,8 +13,7 @@ from vllm.model_executor.layers.fused_moe import fused_moe, single_marlin_moe from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - marlin_quantize, -) + marlin_quantize, ) from vllm.model_executor.models.mixtral import MixtralMoE @@ -29,12 +28,10 @@ def torch_moe(a, w1, w2, score, topk): for i in range(w1.shape[0]): mask = topk_ids == i if mask.sum(): - out[mask] = SiluAndMul()(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose( - 0, 1 - ) - return ( - out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) - ).sum(dim=1) + out[mask] = SiluAndMul()( + a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) def torch_moe_single(a, w, score, topk): @@ -75,7 +72,8 @@ def test_fused_moe( assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", + [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() def test_mixtral_moe(dtype: torch.dtype): """Make sure our Mixtral MoE implementation agrees with the one from @@ -96,7 +94,8 @@ def test_mixtral_moe(dtype: torch.dtype): # Load the weights vllm_moe.gate.weight.data[:] = hf_moe.gate.weight.data for i in range(config.num_local_experts): - weights = (hf_moe.experts[i].w1.weight.data, hf_moe.experts[i].w3.weight.data) + weights = (hf_moe.experts[i].w1.weight.data, + hf_moe.experts[i].w3.weight.data) vllm_moe.experts.w13_weight[i][:] = torch.cat(weights, dim=0) vllm_moe.experts.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data @@ -130,8 +129,7 @@ def stack_and_dev(tensors: List[torch.Tensor]): def compute_max_diff(output, output_ref): return torch.mean(torch.abs(output - output_ref)) / torch.mean( - torch.abs(output_ref) - ) + torch.abs(output_ref)) # TODO: make sure this test works @@ -184,8 +182,7 @@ def test_fused_marlin_moe( for i in range(w1.shape[0]): test_perm = torch.randperm(k) w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = 
marlin_quantize( - w1[i].transpose(1, 0), num_bits, group_size, act_order, test_perm - ) + w1[i].transpose(1, 0), num_bits, group_size, act_order, test_perm) w_ref1_l.append(w_ref1) qweight1_l.append(qweight1) scales1_l.append(scales1) @@ -207,8 +204,7 @@ def test_fused_marlin_moe( for i in range(w2.shape[0]): test_perm = torch.randperm(n) w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize( - w2[i].transpose(1, 0), num_bits, group_size, act_order, test_perm - ) + w2[i].transpose(1, 0), num_bits, group_size, act_order, test_perm) w_ref2_l.append(w_ref2) qweight2_l.append(qweight2) scales2_l.append(scales2) @@ -294,8 +290,7 @@ def test_single_marlin_moe( for i in range(w.shape[0]): test_perm = torch.randperm(k) w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize( - w[i].transpose(1, 0), num_bits, group_size, act_order, test_perm - ) + w[i].transpose(1, 0), num_bits, group_size, act_order, test_perm) w_ref_l.append(w_ref) qweights_l.append(qweight) scales_l.append(scales) @@ -309,9 +304,14 @@ def test_single_marlin_moe( sort_indices = stack_and_dev(sort_indices_l) score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe( - a, qweight, scales, score, g_idx, sort_indices, topk, renormalize=False - ) + marlin_output = single_marlin_moe(a, + qweight, + scales, + score, + g_idx, + sort_indices, + topk, + renormalize=False) torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) assert compute_max_diff(marlin_output, torch_output) < 1e-2 diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index 4ffcda6f85d5e..d84126568d726 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -51,19 +51,25 @@ def fused_moe_marlin( - torch.Tensor: The output tensor after applying the MoE layer. """ # Check constraints. 
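# Reference sketch (illustrative, not from the patch): the fused_topk(...) call a
# few lines below is assumed here to follow the usual MoE routing recipe --
# softmax over the router logits, top-k selection, optional renormalization of
# the selected weights. The production path is a fused kernel, so this plain
# PyTorch version is only for reading along:
import torch

def topk_routing_reference(gating_output: torch.Tensor, topk: int,
                           renormalize: bool):
    # gating_output: [num_tokens, num_experts] router logits
    scores = torch.softmax(gating_output, dim=-1, dtype=torch.float32)
    topk_weights, topk_ids = torch.topk(scores, topk, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids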
- assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" - assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1" - assert hidden_states.shape[1] == w2.shape[2] // 2, "Hidden size mismatch w2" + assert hidden_states.shape[0] == gating_output.shape[ + 0], "Number of tokens mismatch" + assert hidden_states.shape[ + 1] == w1.shape[1] * 16, "Hidden size mismatch w1" + assert hidden_states.shape[ + 1] == w2.shape[2] // 2, "Hidden size mismatch w2" assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" assert w2.is_contiguous(), "Expert weights2 must be contiguous" - assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16] + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] M, K = hidden_states.shape E = w1.shape[0] N = w2.shape[1] * 16 - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, renormalize) + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, + renormalize) get_config_func = functools.partial( try_get_optimal_moe_config, @@ -81,9 +87,10 @@ def fused_moe_marlin( sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 - workspace = torch.zeros( - max_workspace_size, dtype=torch.int, device="cuda", requires_grad=False - ) + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + device="cuda", + requires_grad=False) intermediate_cache2 = torch.empty( (M * topk_ids.shape[1], N), @@ -135,4 +142,5 @@ def fused_moe_marlin( True, ) - return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), + dim=1) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 25c6214318692..214c40a510dfb 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -55,8 +55,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, - params_dtype: torch.dtype, **extra_weight_attrs): + hidden_size: int, intermediate_size: int, + params_dtype: torch.dtype, **extra_weight_attrs): # Fused gate_up_proj (column parallel) w13_weight = torch.nn.Parameter( torch.empty(num_experts, @@ -151,6 +151,7 @@ def forward_tpu( assert topk_group is None return fused_moe(x, w1, w2, router_logits, top_k, renormalize) + class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. @@ -259,8 +260,8 @@ def weight_loader( else: # Input scales can be loaded directly and should be equal. if "input_scale" in weight_name: - if (param_data[expert_id] != 1 and (param_data[expert_id] - - loaded_weight).abs() > 1e-5): + if (param_data[expert_id] != 1 and + (param_data[expert_id] - loaded_weight).abs() > 1e-5): raise ValueError( "input_scales of w1 and w3 of a layer " f"must be equal. 
But got {param_data[expert_id]} " @@ -326,7 +327,7 @@ def forward(self, hidden_states: torch.Tensor, def make_expert_params_mapping( cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, ckpt_up_proj_name: str, - num_experts: int) -> List[Tuple[str, str, int, str]]: + num_experts: int) -> List[Tuple[str, str, int, int]]: gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name] gate_down_up = [ ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index c81517823cbf2..2088177418f1e 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -62,17 +62,15 @@ def __init__( self.lm_head_quantized = lm_head_quantized # Verify supported on platform. - verify_gptq_marlin_supported( - num_bits=self.weight_bits, group_size=self.group_size, is_sym=self.is_sym - ) + verify_gptq_marlin_supported(num_bits=self.weight_bits, + group_size=self.group_size, + is_sym=self.is_sym) def __repr__(self) -> str: - return ( - f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " - f"group_size={self.group_size}, " - f"desc_act={self.desc_act}, " - f"lm_head_quantized={self.lm_head_quantized})" - ) + return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"desc_act={self.desc_act}, " + f"lm_head_quantized={self.lm_head_quantized})") @classmethod def get_name(cls) -> str: @@ -96,40 +94,37 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) is_sym = cls.get_from_keys(config, ["sym"]) - lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) - return cls(weight_bits, group_size, desc_act, is_sym, lm_head_quantized) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], + default=False) + return cls(weight_bits, group_size, desc_act, is_sym, + lm_head_quantized) @classmethod - def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: + def override_quantization_method(cls, hf_quant_cfg, + user_quant) -> Optional[str]: can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg) - is_valid_user_quant = ( - user_quant is None or user_quant == "marlin" or user_quant == "gptq_marlin" - ) + is_valid_user_quant = (user_quant is None or user_quant == "marlin" + or user_quant == "gptq_marlin") if can_convert and is_valid_user_quant: - msg = ( - "The model is convertible to {} during runtime." - " Using {} kernel.".format(cls.get_name(), cls.get_name()) - ) + msg = ("The model is convertible to {} during runtime." + " Using {} kernel.".format(cls.get_name(), cls.get_name())) logger.info(msg) return cls.get_name() if can_convert and user_quant == "gptq": - logger.info( - "Detected that the model can run with gptq_marlin" - ", however you specified quantization=gptq explicitly," - " so forcing gptq. Use quantization=gptq_marlin for" - " faster inference" - ) + logger.info("Detected that the model can run with gptq_marlin" + ", however you specified quantization=gptq explicitly," + " so forcing gptq. 
Use quantization=gptq_marlin for" + " faster inference") return None def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: - if isinstance(layer, LinearBase) or ( - isinstance(layer, ParallelLMHead) and self.lm_head_quantized - ): + if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) + and self.lm_head_quantized): return GPTQMarlinLinearMethod(self) elif isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) @@ -200,9 +195,9 @@ def create_weights( ) # Determine sharding - if marlin_repeat_scales_on_all_ranks( - self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel - ): + if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act, + self.quant_config.group_size, + is_row_parallel): # By setting scale_dim == None, weight_loader will # repeat the scales on each GPU in TP>1 case. scales_and_zp_input_dim = None @@ -244,7 +239,10 @@ def create_weights( # Ignore warning from fused linear layers such as QKVParallelLinear. set_weight_attrs( g_idx, - {**extra_weight_attrs, "input_dim": 0, "ignore_warning": True}, + { + **extra_weight_attrs, "input_dim": 0, + "ignore_warning": True + }, ) # Scales @@ -293,7 +291,8 @@ def create_weights( layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition layer.input_size = input_size - layer.is_k_full = marlin_is_k_full(self.quant_config.desc_act, is_row_parallel) + layer.is_k_full = marlin_is_k_full(self.quant_config.desc_act, + is_row_parallel) # Checkpoints are serialized in AutoGPTQ format, which is different from the # marlin format. This function is called after the weights are loaded. @@ -302,7 +301,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: device = layer.qweight.device # Allocate marlin workspace - layer.workspace = marlin_make_workspace(layer.output_size_per_partition, device) + layer.workspace = marlin_make_workspace( + layer.output_size_per_partition, device) # Handle sorting for activation reordering if needed. if self.quant_config.desc_act: @@ -329,11 +329,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Permute scales from autogptq format to marlin format. 
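# Illustrative sketch (not from the patch): with desc_act (activation
# reordering) the GPTQ group index g_idx is not monotonic, so the marlin path
# keeps g_idx sorted together with the permutation that sorts it -- the same
# bookkeeping the per-expert torch.argsort loop does elsewhere in this series.
# Plain PyTorch version for a single layer:
import torch

def sort_g_idx(g_idx: torch.Tensor):
    # g_idx: [K] group index per input channel
    sort_indices = torch.argsort(g_idx).to(torch.int32)
    return g_idx[sort_indices].contiguous(), sort_indices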
marlin_scales = marlin_permute_scales( layer.scales, - size_k=( - layer.input_size - if self.quant_config.desc_act - else layer.input_size_per_partition - ), + size_k=(layer.input_size if self.quant_config.desc_act else + layer.input_size_per_partition), size_n=layer.output_size_per_partition, group_size=self.quant_config.group_size, ) @@ -411,16 +408,20 @@ def create_weights( set_weight_attrs(w2_qweight, extra_weight_attrs) # up_proj scales w13_scales = torch.nn.Parameter( - torch.empty( - num_experts, scales_size13, 2 * intermediate_size, dtype=params_dtype - ), + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size, + dtype=params_dtype), requires_grad=False, ) layer.register_parameter("w13_scales", w13_scales) set_weight_attrs(w13_scales, extra_weight_attrs) # down_proj scales w2_scales = torch.nn.Parameter( - torch.empty(num_experts, scales_size2, hidden_size, dtype=params_dtype), + torch.empty(num_experts, + scales_size2, + hidden_size, + dtype=params_dtype), requires_grad=False, ) layer.register_parameter("w2_scales", w2_scales) @@ -453,7 +454,8 @@ def create_weights( ), requires_grad=False, ) - layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices) + layer.register_parameter("w13_g_idx_sort_indices", + w13_g_idx_sort_indices) set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) w2_g_idx_sort_indices = torch.nn.Parameter( torch.empty( @@ -463,7 +465,8 @@ def create_weights( ), requires_grad=False, ) - layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices) + layer.register_parameter("w2_g_idx_sort_indices", + w2_g_idx_sort_indices) set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) layer.marlin_state = GPTQMarlinState.REPACK @@ -479,36 +482,42 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) for e in range(num_experts): - w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_g_idx[e]).to( - torch.int32 - ) + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_g_idx[e]).to(torch.int32) w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to( - torch.int32 - ) - w13_sorted_g_idx[e] = layer.w13_g_idx[e][w13_g_idx_sort_indices[e]] - w2_sorted_g_idx[e] = layer.w2_g_idx[e][w2_g_idx_sort_indices[e]] + torch.int32) + w13_sorted_g_idx[e] = layer.w13_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_g_idx[e][ + w2_g_idx_sort_indices[e]] replace_tensor(layer, "w13_g_idx", w13_sorted_g_idx) replace_tensor(layer, "w2_g_idx", w2_sorted_g_idx) - replace_tensor(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices) - replace_tensor(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices) + replace_tensor(layer, "w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_tensor(layer, "w2_g_idx_sort_indices", + w2_g_idx_sort_indices) else: # Reset g_idx related tensors num_experts = layer.w13_g_idx.shape[0] device = layer.w13_g_idx.device layer.w13_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), requires_grad=False, ) layer.w2_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), requires_grad=False, ) layer.w13_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), + torch.empty((num_experts, 0), 
dtype=torch.int32, + device=device), requires_grad=False, ) layer.w2_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), requires_grad=False, ) # Repack weights @@ -530,11 +539,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Repack scales marlin_w13_scales = marlin_moe_permute_scales( s=layer.w13_scales, - size_k=( - layer.input_size - if self.quant_config.desc_act - else layer.input_size_per_partition - ), + size_k=(layer.input_size if self.quant_config.desc_act else + layer.input_size_per_partition), size_n=layer.w13_scales.shape[2], group_size=self.quant_config.group_size, ) @@ -542,11 +548,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, - size_n=( - layer.input_size - if self.quant_config.desc_act - else layer.input_size_per_partition - ), + size_n=(layer.input_size if self.quant_config.desc_act else + layer.input_size_per_partition), group_size=self.quant_config.group_size, ) replace_tensor(layer, "w2_scales", marlin_w2_scales) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 8fbd537a2c031..d5c4256ded522 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -437,7 +437,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader(param, + loaded_weight, + shard_id, + is_quantized=True) break else: for mapping in expert_params_mapping: @@ -454,7 +457,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loaded_weight, weight_name, shard_id=shard_id, - expert_id=expert_id) + expert_id=expert_id, + is_quantized=True) break else: # Skip loading extra bias for GPTQ models. 
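# Illustrative note on the size_k arithmetic used above: GPTQ packs
# 32 // weight_bits quantized values into each int32, so pack_factor is 8 for
# 4-bit weights and a packed dimension of length D describes D * pack_factor
# logical elements. The numbers below are made-up examples:
def unpacked_len(packed_len: int, weight_bits: int) -> int:
    pack_factor = 32 // weight_bits  # 8 for 4-bit, 4 for 8-bit
    return packed_len * pack_factor

assert unpacked_len(512, 4) == 4096
assert unpacked_len(512, 8) == 2048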
@@ -471,4 +475,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) + weight_loader(param, loaded_weight, is_quantized=True) From 19c5c59d82b8329be8e1cff2f0b51e40e83e83a3 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 14:45:20 -0700 Subject: [PATCH 012/106] is quantized change --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index d5c4256ded522..c36f86f2d65f0 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -475,4 +475,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight, is_quantized=True) + weight_loader(param, loaded_weight) From 3b7cc60a50478c180b0a48744fa37aab3084e413 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 14:56:12 -0700 Subject: [PATCH 013/106] debug stat --- vllm/model_executor/models/mixtral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c36f86f2d65f0..341ff806f37d5 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -50,7 +50,8 @@ from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers - +import logging +logger = logging.getLogger(__name__) class MixtralMoE(nn.Module): """A tensor-parallel MoE implementation for Mixtral that shards each expert across all ranks. @@ -451,6 +452,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip layers on other devices. 
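# Illustrative sketch of the rename the load_weights loop performs: per-expert
# checkpoint names are rewritten to the fused parameter names via the
# (param_name, weight_name, expert_id, shard_id) tuples from
# make_expert_params_mapping. Toy version (the example name is hypothetical):
def remap(name: str, weight_name: str, param_name: str) -> str:
    return name.replace(weight_name, param_name) if weight_name in name else name

ckpt = "model.layers.0.block_sparse_moe.experts.3.w1.qweight"
assert remap(ckpt, "experts.3.w1.qweight", "experts.w13_qweight") == \
    "model.layers.0.block_sparse_moe.experts.w13_qweight"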
if is_pp_missing_parameter(name, self): continue + logger.error(params_dict.keys()) param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, From d2c4754df8418439acb7406acbb51e66461b6aa2 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 14:59:35 -0700 Subject: [PATCH 014/106] replace wiehgt name with param name --- vllm/model_executor/layers/fused_moe/__init__.py | 2 +- vllm/model_executor/models/mixtral.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index beb94f10a557e..0d871232305ae 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,4 +1,4 @@ -from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin +sfrom vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin from vllm.model_executor.layers.fused_moe.fused_moe import single_marlin_moe from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase from vllm.triton_utils import HAS_TRITON diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 341ff806f37d5..e41250cf99707 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -457,7 +457,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = param.weight_loader weight_loader(param, loaded_weight, - weight_name, + param_name, shard_id=shard_id, expert_id=expert_id, is_quantized=True) From f579cb25242aa33514e7fe27068e2000e87d6141 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 15:00:26 -0700 Subject: [PATCH 015/106] typo --- vllm/model_executor/layers/fused_moe/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 0d871232305ae..beb94f10a557e 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,4 +1,4 @@ -sfrom vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin +from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin from vllm.model_executor.layers.fused_moe.fused_moe import single_marlin_moe from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase from vllm.triton_utils import HAS_TRITON From 79394eb8e23fb703a12a9a28fdf283dfa2599080 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 15:08:05 -0700 Subject: [PATCH 016/106] debug --- vllm/model_executor/models/mixtral.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e41250cf99707..40948bfe1f0b5 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -444,6 +444,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): is_quantized=True) break else: + logger.error(expert_params_mapping) for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: From ec75f4ef3c036e2a2e794098cca804956c5c9b96 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 15:09:53 -0700 Subject: [PATCH 017/106] more debug --- vllm/model_executor/models/mixtral.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 40948bfe1f0b5..5d2a07a7b50f0 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -449,7 +449,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue + logger.error(weight_name, param_name, name, name.replace(weight_name, param_name)) name = name.replace(weight_name, param_name) + logger.error(name in params_dict) # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue From 91ca97078a59d2722965599feb4235804fbd5e1f Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 15:11:05 -0700 Subject: [PATCH 018/106] only relevant logging --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 5d2a07a7b50f0..1cfa219f9d268 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -444,7 +444,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): is_quantized=True) break else: - logger.error(expert_params_mapping) + # logger.error(expert_params_mapping) for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: @@ -455,7 +455,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - logger.error(params_dict.keys()) + # logger.error(params_dict.keys()) param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, From 1b9d5bb25d68fee931803789251b93bc35d8fb82 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 15:12:54 -0700 Subject: [PATCH 019/106] log --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1cfa219f9d268..83277083e24fa 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -449,9 +449,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue - logger.error(weight_name, param_name, name, name.replace(weight_name, param_name)) + logger.error(weight_name, param_name, name) name = name.replace(weight_name, param_name) - logger.error(name in params_dict) + logger.error(f"Loading {name} from {weight_name}") # Skip layers on other devices. 
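# Illustrative sketch of what weight_loader does with shard_id for the packed
# expert tensors later in this series: the gate projection (checkpoint w1) fills
# the first half of the fused w13 buffer along its last (output) dimension and
# the up projection (w3) fills the second half, while w2 gets its own buffer.
# Shard naming here is symbolic; the numeric shard ids are set by the mapping.
import torch

def place_into_w13(w13: torch.Tensor, expert_id: int, which: str,
                   loaded: torch.Tensor) -> None:
    half = loaded.shape[-1]
    if which == "w1":      # gate_proj -> first half
        w13[expert_id, :, :half] = loaded
    elif which == "w3":    # up_proj -> second half
        w13[expert_id, :, half:] = loaded
    else:
        raise ValueError(f"unexpected shard {which!r}")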
if is_pp_missing_parameter(name, self): continue From ec0671913b4d72ad8bde7f2ee695c91a7ac6c311 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 15:14:03 -0700 Subject: [PATCH 020/106] log --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 83277083e24fa..71e68129e9f80 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -449,7 +449,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue - logger.error(weight_name, param_name, name) + logger.error(f"{weight_name} {param_name} {name}") name = name.replace(weight_name, param_name) logger.error(f"Loading {name} from {weight_name}") # Skip layers on other devices. From 71d82e125319878825e8c9bed84143b95af7cc4a Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 15:31:23 -0700 Subject: [PATCH 021/106] removing qzero weights --- .../layers/quantization/gptq_marlin.py | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 2088177418f1e..542c0244f431f 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -263,31 +263,31 @@ def create_weights( }, ) - # Quantized zero-points - qzeros = Parameter( - torch.empty( - scales_and_zp_size, - output_size_per_partition // self.quant_config.pack_factor, - dtype=torch.int32, - device="meta", - ), - requires_grad=False, - ) - set_weight_attrs( - qzeros, - { - **extra_weight_attrs, - "input_dim": scales_and_zp_input_dim, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }, - ) + # # Quantized zero-points + # qzeros = Parameter( + # torch.empty( + # scales_and_zp_size, + # output_size_per_partition // self.quant_config.pack_factor, + # dtype=torch.int32, + # device="meta", + # ), + # requires_grad=False, + # ) + # set_weight_attrs( + # qzeros, + # { + # **extra_weight_attrs, + # "input_dim": scales_and_zp_input_dim, + # "output_dim": 1, + # "packed_dim": 1, + # "pack_factor": self.quant_config.pack_factor, + # }, + # ) layer.register_parameter("qweight", qweight) layer.register_parameter("g_idx", g_idx) layer.register_parameter("scales", scales) - layer.register_parameter("qzeros", qzeros) + # layer.register_parameter("qzeros", qzeros) layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition layer.input_size = input_size From d3465d07813a09747d5d439b16b57cdbd7c2d566 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:02:10 -0700 Subject: [PATCH 022/106] Qzeors in expert mapping --- vllm/model_executor/layers/fused_moe/layer.py | 11 ++++ .../layers/quantization/gptq_marlin.py | 62 ++++++++++++------- 2 files changed, 52 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 214c40a510dfb..d89867090ec49 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -388,4 +388,15 @@ def make_expert_params_mapping( shard_id, ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) 
+ ] + [ + # These are the qzeros for the experts + # (param_name, weight_name, expert_id, shard_id) + ( + "experts.w13_qzeros" + if weight_name in gate_up else "experts.w2_qzeros", + f"experts.{expert_id}.{weight_name}.qzeros", + expert_id, + shard_id, + ) for expert_id in range(num_experts) + for shard_id, weight_name in enumerate(gate_down_up) ]) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 542c0244f431f..9b196477fbd21 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -263,31 +263,31 @@ def create_weights( }, ) - # # Quantized zero-points - # qzeros = Parameter( - # torch.empty( - # scales_and_zp_size, - # output_size_per_partition // self.quant_config.pack_factor, - # dtype=torch.int32, - # device="meta", - # ), - # requires_grad=False, - # ) - # set_weight_attrs( - # qzeros, - # { - # **extra_weight_attrs, - # "input_dim": scales_and_zp_input_dim, - # "output_dim": 1, - # "packed_dim": 1, - # "pack_factor": self.quant_config.pack_factor, - # }, - # ) + # Quantized zero-points + qzeros = Parameter( + torch.empty( + scales_and_zp_size, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + device="meta", + ), + requires_grad=False, + ) + set_weight_attrs( + qzeros, + { + **extra_weight_attrs, + "input_dim": scales_and_zp_input_dim, + "output_dim": 1, + "packed_dim": 1, + "pack_factor": self.quant_config.pack_factor, + }, + ) layer.register_parameter("qweight", qweight) layer.register_parameter("g_idx", g_idx) layer.register_parameter("scales", scales) - # layer.register_parameter("qzeros", qzeros) + layer.register_parameter("qzeros", qzeros) layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition layer.input_size = input_size @@ -426,6 +426,26 @@ def create_weights( ) layer.register_parameter("w2_scales", w2_scales) set_weight_attrs(w2_scales, extra_weight_attrs) + # up_proj scales + w13_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w13_qzeros", w13_qzeros) + set_weight_attrs(w13_qzeros, extra_weight_attrs) + # down_proj scales + w2_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_qzeros", w2_qzeros) + set_weight_attrs(w2_qzeros, extra_weight_attrs) w13_g_idx = torch.nn.Parameter( torch.empty( num_experts, From 226ee265d6daf964cad6a22536cbf23e3fcc68da Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:04:26 -0700 Subject: [PATCH 023/106] Debug --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 71e68129e9f80..054f8a48593c6 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -444,7 +444,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): is_quantized=True) break else: - # logger.error(expert_params_mapping) + logger.error(expert_params_mapping) for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: @@ -455,7 +455,7 @@ def 
load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - # logger.error(params_dict.keys()) + logger.error(params_dict.keys()) param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, From 21d7d27de837d05c0368addfd6d19fcaf23ea7ee Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:07:09 -0700 Subject: [PATCH 024/106] Load qzero --- vllm/model_executor/layers/fused_moe/layer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d89867090ec49..24120b2d5b2f2 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -255,6 +255,11 @@ def weight_loader( raise ValueError(f"Invalid weight name: {weight_name}: " "must contain 'w13' or 'w2'.") param_data[expert_id] = loaded_weight + elif "qzeros" in weight_name: + if "w13" not in weight_name and "w2" not in weight_name: + raise ValueError(f"Invalid weight name: {weight_name}: " + "must contain 'w13' or 'w2'.") + param_data[expert_id] = loaded_weight else: raise ValueError(f"Invalid weight name: {weight_name}.") else: From 2dabb4b9aec04ec601c0d4879a21dd0d5fcce0a7 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:12:29 -0700 Subject: [PATCH 025/106] rm 2x --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 9b196477fbd21..efc854fe72e36 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -430,7 +430,7 @@ def create_weights( w13_qzeros = torch.nn.Parameter( torch.empty(num_experts, scales_size13, - 2 * intermediate_size // self.quant_config.pack_factor, + intermediate_size // self.quant_config.pack_factor, dtype=params_dtype), requires_grad=False, ) From 63669768bfcabafe8eda18ff60432337ee6a2889 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:15:05 -0700 Subject: [PATCH 026/106] Mapping for scales --- vllm/model_executor/layers/fused_moe/layer.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 24120b2d5b2f2..60e9912c6b6ad 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -360,6 +360,17 @@ def make_expert_params_mapping( shard_id, ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) + ] + [ + # These are the weights for the experts + # (param_name, weight_name, expert_id, shard_id) + ( + "experts.w13_scales" + if weight_name in gate_up else "experts.w2_scales", + f"experts.{expert_id}.{weight_name}.scales", + expert_id, + shard_id, + ) for expert_id in range(num_experts) + for shard_id, weight_name in enumerate(gate_down_up) ] + [ # These are the weight scales for the experts # (param_name, weight_name, expert_id, shard_id) From d63c0966ed8d8d30a97fdd259c150dcd4a56fdb8 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:15:54 -0700 Subject: [PATCH 027/106] rm logging --- vllm/model_executor/models/mixtral.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py 
b/vllm/model_executor/models/mixtral.py index 054f8a48593c6..c9f0d872e7cc2 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -444,18 +444,18 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): is_quantized=True) break else: - logger.error(expert_params_mapping) + # logger.error(expert_params_mapping) for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue - logger.error(f"{weight_name} {param_name} {name}") + # logger.error(f"{weight_name} {param_name} {name}") name = name.replace(weight_name, param_name) - logger.error(f"Loading {name} from {weight_name}") + # logger.error(f"Loading {name} from {weight_name}") # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - logger.error(params_dict.keys()) + # logger.error(params_dict.keys()) param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, From 360fef4273a05b936a06f92b6c7edf81a36ac2d9 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:16:14 -0700 Subject: [PATCH 028/106] Adding lyaer wise logging --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c9f0d872e7cc2..ec7019a39d218 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -451,7 +451,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): continue # logger.error(f"{weight_name} {param_name} {name}") name = name.replace(weight_name, param_name) - # logger.error(f"Loading {name} from {weight_name}") + logger.error(f"Loading {name} from {weight_name}") # Skip layers on other devices. 
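# Illustrative note on the qzeros buffers added in the preceding patches: in
# AutoGPTQ checkpoints the zero points are packed like the weights, so qzeros
# has shape [num_groups, out_features // pack_factor] in int32, next to scales
# of shape [num_groups, out_features]; the per-expert w13_qzeros / w2_qzeros
# buffers mirror that with a leading num_experts dimension. Sizes below are
# made-up examples:
def qzeros_shape(num_groups: int, out_features: int, weight_bits: int = 4):
    pack_factor = 32 // weight_bits
    return (num_groups, out_features // pack_factor)

assert qzeros_shape(num_groups=32, out_features=28672) == (32, 3584)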
if is_pp_missing_parameter(name, self): continue From c23d6169261c40136b5501957d9ff71064ef5bd8 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:40:25 -0700 Subject: [PATCH 029/106] shard ids --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ec7019a39d218..4a76689855123 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -444,7 +444,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): is_quantized=True) break else: - # logger.error(expert_params_mapping) + logger.error(expert_params_mapping) for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: From 8d81d14fc3e9b462410f46a369ea656e27883a59 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:45:51 -0700 Subject: [PATCH 030/106] Loading qzero correctly --- vllm/model_executor/layers/fused_moe/layer.py | 11 +++-------- .../model_executor/layers/quantization/gptq_marlin.py | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 60e9912c6b6ad..41b24bbc5ee41 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -235,16 +235,16 @@ def weight_loader( param_data = param.data if is_quantized: - if "_qweight" in weight_name or "_scales" in weight_name: + if ["_qweight", "_scales", "_qzeros"] in weight_name: if "w13" in weight_name: shard_size = self.intermediate_size_per_partition if shard_id == 0: param_data[expert_id, :, :shard_size] = loaded_weight - elif shard_id == 1: + elif shard_id == 2: param_data[expert_id, :, shard_size:] = loaded_weight else: raise ValueError(f"Invalid shard_id: {shard_id}: " - "must be 0 or 1.") + "must be 0 or 2.") elif "w2" in weight_name: param_data[expert_id][:] = loaded_weight else: @@ -255,11 +255,6 @@ def weight_loader( raise ValueError(f"Invalid weight name: {weight_name}: " "must contain 'w13' or 'w2'.") param_data[expert_id] = loaded_weight - elif "qzeros" in weight_name: - if "w13" not in weight_name and "w2" not in weight_name: - raise ValueError(f"Invalid weight name: {weight_name}: " - "must contain 'w13' or 'w2'.") - param_data[expert_id] = loaded_weight else: raise ValueError(f"Invalid weight name: {weight_name}.") else: diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index efc854fe72e36..9b196477fbd21 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -430,7 +430,7 @@ def create_weights( w13_qzeros = torch.nn.Parameter( torch.empty(num_experts, scales_size13, - intermediate_size // self.quant_config.pack_factor, + 2 * intermediate_size // self.quant_config.pack_factor, dtype=params_dtype), requires_grad=False, ) From 22e1aa7b6f1b1ced2323a7c2c12437fb7208d36e Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:46:33 -0700 Subject: [PATCH 031/106] List operand --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 41b24bbc5ee41..a1508bb4b34e2 100644 --- 
a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -235,7 +235,7 @@ def weight_loader( param_data = param.data if is_quantized: - if ["_qweight", "_scales", "_qzeros"] in weight_name: + if weight_name in ["_qweight", "_scales", "_qzeros"]: if "w13" in weight_name: shard_size = self.intermediate_size_per_partition if shard_id == 0: From 81e01f383bc84aed8ce4fae093cb7d3fe50188a3 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:49:09 -0700 Subject: [PATCH 032/106] If clause --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a1508bb4b34e2..4dbcdd57f8337 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -235,7 +235,7 @@ def weight_loader( param_data = param.data if is_quantized: - if weight_name in ["_qweight", "_scales", "_qzeros"]: + if "_qweight" in weight_name or "_scales" in weight_name or "_qzeros" in weight_name: if "w13" in weight_name: shard_size = self.intermediate_size_per_partition if shard_id == 0: From dcfd32d1aeccb98dec208d474d5020c7d21e7cf1 Mon Sep 17 00:00:00 2001 From: Dhruva Bansal Date: Mon, 12 Aug 2024 23:57:20 +0000 Subject: [PATCH 033/106] Able to load layers --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 4dbcdd57f8337..686d0415c5e5c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -237,7 +237,7 @@ def weight_loader( if is_quantized: if "_qweight" in weight_name or "_scales" in weight_name or "_qzeros" in weight_name: if "w13" in weight_name: - shard_size = self.intermediate_size_per_partition + shard_size = loaded_weight.size()[-1] if shard_id == 0: param_data[expert_id, :, :shard_size] = loaded_weight elif shard_id == 2: From f04cbeaaaceb777515528a7cc66657499c478b17 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:58:44 -0700 Subject: [PATCH 034/106] Setting load quant to false --- vllm/model_executor/models/mixtral.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 4a76689855123..2043580ab8f1c 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -440,8 +440,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = param.weight_loader weight_loader(param, loaded_weight, - shard_id, - is_quantized=True) + shard_id) break else: logger.error(expert_params_mapping) From a56821d352438f24dfedfc2413901c9c511d88f1 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 16:59:44 -0700 Subject: [PATCH 035/106] Disabling logging --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 2043580ab8f1c..011b10583c528 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -443,14 +443,14 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): shard_id) break else: - logger.error(expert_params_mapping) + # logger.error(expert_params_mapping) for mapping 
in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue # logger.error(f"{weight_name} {param_name} {name}") name = name.replace(weight_name, param_name) - logger.error(f"Loading {name} from {weight_name}") + # logger.error(f"Loading {name} from {weight_name}") # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue From 7f961c6d4d013b281a22dac79b6b073fdf8a122c Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 17:19:12 -0700 Subject: [PATCH 036/106] Removing *2 in marlin moe repack --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 048ab9195d24e..f9f4b9c725dda 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -283,7 +283,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: num_experts = b_q_weight.shape[0] - output = torch.empty((num_experts, size_k // 16, size_n * 2), + output = torch.empty((num_experts, size_k // 16, size_n), device=b_q_weight.device, dtype=b_q_weight.dtype) for e in range(num_experts): From 4a6c7ffc776b1c8580ada27c3148172d24d1c902 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 17:28:11 -0700 Subject: [PATCH 037/106] *4 in marlin moe repack --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index f9f4b9c725dda..1458103fa1b6d 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -283,7 +283,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: num_experts = b_q_weight.shape[0] - output = torch.empty((num_experts, size_k // 16, size_n), + output = torch.empty((num_experts, size_k // 16, size_n * 4), device=b_q_weight.device, dtype=b_q_weight.dtype) for e in range(num_experts): From e6cd286d44d27db250737bc22a086e8a50082117 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 17:29:36 -0700 Subject: [PATCH 038/106] bits --- vllm/_custom_ops.py | 2 +- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 1458103fa1b6d..048ab9195d24e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -283,7 +283,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: num_experts = b_q_weight.shape[0] - output = torch.empty((num_experts, size_k // 16, size_n * 4), + output = torch.empty((num_experts, size_k // 16, size_n * 2), device=b_q_weight.device, dtype=b_q_weight.dtype) for e in range(num_experts): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 9b196477fbd21..6133a4e172f4c 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -545,7 +545,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_qweight, layer.w13_g_idx_sort_indices, layer.w13_qweight.shape[1] * self.quant_config.pack_factor, - layer.w13_qweight.shape[2], + layer.w13_qweight.shape[2] * 2, self.quant_config.weight_bits, ) replace_tensor(layer, "w13_qweight", marlin_w13_qweight) @@ -554,6 +554,7 @@ def process_weights_after_loading(self, 
layer: torch.nn.Module) -> None: layer.w2_g_idx_sort_indices, layer.w2_qweight.shape[1] * self.quant_config.pack_factor, layer.w2_qweight.shape[2], + self.quant_config.weight_bits, ) replace_tensor(layer, "w2_qweight", marlin_w2_qweight) # Repack scales From 90241c4e4aa7532cf0f29211cf7aa3c700974843 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 17:30:28 -0700 Subject: [PATCH 039/106] *4 --- vllm/_custom_ops.py | 2 +- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 048ab9195d24e..1458103fa1b6d 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -283,7 +283,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: num_experts = b_q_weight.shape[0] - output = torch.empty((num_experts, size_k // 16, size_n * 2), + output = torch.empty((num_experts, size_k // 16, size_n * 4), device=b_q_weight.device, dtype=b_q_weight.dtype) for e in range(num_experts): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 6133a4e172f4c..ca756d652f928 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -545,7 +545,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_qweight, layer.w13_g_idx_sort_indices, layer.w13_qweight.shape[1] * self.quant_config.pack_factor, - layer.w13_qweight.shape[2] * 2, + layer.w13_qweight.shape[2], self.quant_config.weight_bits, ) replace_tensor(layer, "w13_qweight", marlin_w13_qweight) From 67409e93237025e6db6027934711da1da87fd878 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 17:38:25 -0700 Subject: [PATCH 040/106] intermediate size --- vllm/model_executor/layers/quantization/gptq_marlin.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ca756d652f928..70b31a066740b 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -560,8 +560,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Repack scales marlin_w13_scales = marlin_moe_permute_scales( s=layer.w13_scales, - size_k=(layer.input_size if self.quant_config.desc_act else - layer.input_size_per_partition), + size_k=(layer.intermediate_size if self.quant_config.desc_act else + layer.intermediate_size_per_partition), size_n=layer.w13_scales.shape[2], group_size=self.quant_config.group_size, ) @@ -569,8 +569,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, - size_n=(layer.input_size if self.quant_config.desc_act else - layer.input_size_per_partition), + size_k=(layer.intermediate_size if self.quant_config.desc_act else + layer.intermediate_size_per_partition), group_size=self.quant_config.group_size, ) replace_tensor(layer, "w2_scales", marlin_w2_scales) From 539032ef58cfad750d93365f1e91c64464505081 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 17:38:59 -0700 Subject: [PATCH 041/106] repeat keyword --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 70b31a066740b..9adeac6f96417 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -569,7 +569,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, - size_k=(layer.intermediate_size if self.quant_config.desc_act else + size_n=(layer.intermediate_size if self.quant_config.desc_act else layer.intermediate_size_per_partition), group_size=self.quant_config.group_size, ) From 57b1cbe81846613b46f55fcbf7f41712a1ad4556 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 17:41:06 -0700 Subject: [PATCH 042/106] hidden size --- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 9adeac6f96417..78378b271f1de 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -569,8 +569,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, - size_n=(layer.intermediate_size if self.quant_config.desc_act else - layer.intermediate_size_per_partition), + size_n=layer.hidden_size, group_size=self.quant_config.group_size, ) replace_tensor(layer, "w2_scales", marlin_w2_scales) From 87f1dd4cd3a2679b764105850ba60e2bbb1e1999 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 17:50:47 -0700 Subject: [PATCH 043/106] intermediate size back --- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 78378b271f1de..9adeac6f96417 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -569,7 +569,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, - size_n=layer.hidden_size, + size_n=(layer.intermediate_size if self.quant_config.desc_act else + layer.intermediate_size_per_partition), group_size=self.quant_config.group_size, ) replace_tensor(layer, "w2_scales", marlin_w2_scales) From 4c073c2a99c9030c705fb9870f73e3d09560f74f Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 17:59:44 -0700 Subject: [PATCH 044/106] permute scales w3 --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 9adeac6f96417..52bef66f57294 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -568,7 +568,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: replace_tensor(layer, "w13_scales", 
marlin_w13_scales) marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, - size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, + size_k=layer.w2_scales.shape[1], size_n=(layer.intermediate_size if self.quant_config.desc_act else layer.intermediate_size_per_partition), group_size=self.quant_config.group_size, From d73249346a57999c6639b689d90085f2088f1586 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 18:00:34 -0700 Subject: [PATCH 045/106] *2 --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 52bef66f57294..617628d268f11 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -568,7 +568,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: replace_tensor(layer, "w13_scales", marlin_w13_scales) marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, - size_k=layer.w2_scales.shape[1], + size_k=2 * layer.w2_scales.shape[1] * self.quant_config.pack_factor, size_n=(layer.intermediate_size if self.quant_config.desc_act else layer.intermediate_size_per_partition), group_size=self.quant_config.group_size, From fdc22c4ef1a3bc8afce41c25ad8ec19acc4bcac7 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 18:06:12 -0700 Subject: [PATCH 046/106] log --- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 617628d268f11..fe2a8c7d2f6ca 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -566,9 +566,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: group_size=self.quant_config.group_size, ) replace_tensor(layer, "w13_scales", marlin_w13_scales) + logger.error(f"{layer.w2_scales.size()}, {layer.intermediate_size_per_partition}, {self.quant_config.group_size}") marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, - size_k=2 * layer.w2_scales.shape[1] * self.quant_config.pack_factor, + size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, size_n=(layer.intermediate_size if self.quant_config.desc_act else layer.intermediate_size_per_partition), group_size=self.quant_config.group_size, From 272822eafbec55b0f39415770f009177339c7986 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 18:10:05 -0700 Subject: [PATCH 047/106] shape as 2 --- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index fe2a8c7d2f6ca..911fbb154fce6 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -570,8 +570,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, - size_n=(layer.intermediate_size if self.quant_config.desc_act else - layer.intermediate_size_per_partition), + size_n=layer.w2_scales.shape[2], 
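
The size_k / size_n churn in the scale calls above is easier to follow with the GPTQ layout in mind: for a group-quantized [K, N] linear the scales tensor is [K // group_size, N], and for the w2 (down) projection K is the intermediate size and N the hidden size. A small stand-alone check of that bookkeeping (illustrative only, not vLLM code; 14336/4096 are the Mixtral-8x7B sizes assumed here):

def expected_scale_shape(size_k: int, size_n: int, group_size: int):
    # one fp16 scale per (group along K, output column)
    num_groups = 1 if group_size == -1 else size_k // group_size
    return (num_groups, size_n)

# w2 maps intermediate_size=14336 back to hidden_size=4096; with group_size=128:
assert expected_scale_shape(14336, 4096, 128) == (112, 4096)
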
group_size=self.quant_config.group_size, ) replace_tensor(layer, "w2_scales", marlin_w2_scales) From 3ce045e5c19fd562d7f69d5a0ae06fc422317017 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 18:21:08 -0700 Subject: [PATCH 048/106] test --- vllm/_custom_ops.py | 2 +- .../layers/quantization/gptq_marlin.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 1458103fa1b6d..048ab9195d24e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -283,7 +283,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: num_experts = b_q_weight.shape[0] - output = torch.empty((num_experts, size_k // 16, size_n * 4), + output = torch.empty((num_experts, size_k // 16, size_n * 2), device=b_q_weight.device, dtype=b_q_weight.dtype) for e in range(num_experts): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 911fbb154fce6..70773505dcb2d 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -541,14 +541,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: requires_grad=False, ) # Repack weights - marlin_w13_qweight = ops.gptq_marlin_moe_repack( - layer.w13_qweight, - layer.w13_g_idx_sort_indices, - layer.w13_qweight.shape[1] * self.quant_config.pack_factor, - layer.w13_qweight.shape[2], - self.quant_config.weight_bits, - ) - replace_tensor(layer, "w13_qweight", marlin_w13_qweight) + # marlin_w13_qweight = ops.gptq_marlin_moe_repack( + # layer.w13_qweight, + # layer.w13_g_idx_sort_indices, + # layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + # layer.w13_qweight.shape[2], + # self.quant_config.weight_bits, + # ) + # replace_tensor(layer, "w13_qweight", marlin_w13_qweight) marlin_w2_qweight = ops.gptq_marlin_moe_repack( layer.w2_qweight, layer.w2_g_idx_sort_indices, From c4ba4779bd44f147ea8b745106524f45b3db32f1 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 12 Aug 2024 23:17:10 -0700 Subject: [PATCH 049/106] Increasing to 4 and changing assert --- vllm/_custom_ops.py | 2 +- .../layers/fused_moe/fused_moe_marlin.py | 2 +- .../layers/quantization/gptq_marlin.py | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 048ab9195d24e..1458103fa1b6d 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -283,7 +283,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: num_experts = b_q_weight.shape[0] - output = torch.empty((num_experts, size_k // 16, size_n * 2), + output = torch.empty((num_experts, size_k // 16, size_n * 4), device=b_q_weight.device, dtype=b_q_weight.dtype) for e in range(num_experts): diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index d84126568d726..75e02af9d77af 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -56,7 +56,7 @@ def fused_moe_marlin( assert hidden_states.shape[ 1] == w1.shape[1] * 16, "Hidden size mismatch w1" assert hidden_states.shape[ - 1] == w2.shape[2] // 2, "Hidden size mismatch w2" + 1] == w2.shape[2] // 4, "Hidden size mismatch w2" assert 
gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 70773505dcb2d..911fbb154fce6 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -541,14 +541,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: requires_grad=False, ) # Repack weights - # marlin_w13_qweight = ops.gptq_marlin_moe_repack( - # layer.w13_qweight, - # layer.w13_g_idx_sort_indices, - # layer.w13_qweight.shape[1] * self.quant_config.pack_factor, - # layer.w13_qweight.shape[2], - # self.quant_config.weight_bits, - # ) - # replace_tensor(layer, "w13_qweight", marlin_w13_qweight) + marlin_w13_qweight = ops.gptq_marlin_moe_repack( + layer.w13_qweight, + layer.w13_g_idx_sort_indices, + layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + layer.w13_qweight.shape[2], + self.quant_config.weight_bits, + ) + replace_tensor(layer, "w13_qweight", marlin_w13_qweight) marlin_w2_qweight = ops.gptq_marlin_moe_repack( layer.w2_qweight, layer.w2_g_idx_sort_indices, From 2ea8370e136d58c085b703f779f6f029649f8fe0 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 12:00:07 -0700 Subject: [PATCH 050/106] logging --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 911fbb154fce6..c07d7b10ef193 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -540,6 +540,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: device=device), requires_grad=False, ) + logger.error(f"W13 qweight size - {layer.w13_qweight.size()}") + logger.error(f"Quant Config: {self.quant_config}") # Repack weights marlin_w13_qweight = ops.gptq_marlin_moe_repack( layer.w13_qweight, From 8287025224cda4529d624a8c2aceebaecec4ddd6 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 12:32:13 -0700 Subject: [PATCH 051/106] marlin moe repack change --- vllm/_custom_ops.py | 8 +++++++- vllm/model_executor/layers/fused_moe/fused_moe_marlin.py | 2 +- vllm/model_executor/layers/quantization/gptq_marlin.py | 8 ++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 1458103fa1b6d..37787aa6d3b2f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -283,7 +283,13 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: num_experts = b_q_weight.shape[0] - output = torch.empty((num_experts, size_k // 16, size_n * 4), + # output = torch.empty((num_experts, size_k // 16, size_n * 2), + # device=b_q_weight.device, + # dtype=b_q_weight.dtype) + # for e in range(num_experts): + # output[e] = torch.ops._C.gptq_marlin_repack(b_q_weight[e], perm[e], + # size_k, size_n, num_bits) + output = torch.empty((num_experts, size_k, size_n), device=b_q_weight.device, dtype=b_q_weight.dtype) for e in range(num_experts): diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index 
75e02af9d77af..d84126568d726 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -56,7 +56,7 @@ def fused_moe_marlin( assert hidden_states.shape[ 1] == w1.shape[1] * 16, "Hidden size mismatch w1" assert hidden_states.shape[ - 1] == w2.shape[2] // 4, "Hidden size mismatch w2" + 1] == w2.shape[2] // 2, "Hidden size mismatch w2" assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index c07d7b10ef193..ac5e018193a97 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -491,7 +491,6 @@ def create_weights( layer.marlin_state = GPTQMarlinState.REPACK def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - layer.marlin_state = GPTQMarlinState.READY # Process act_order if self.quant_config.desc_act: @@ -546,7 +545,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w13_qweight = ops.gptq_marlin_moe_repack( layer.w13_qweight, layer.w13_g_idx_sort_indices, - layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + layer.w13_qweight.shape[1], layer.w13_qweight.shape[2], self.quant_config.weight_bits, ) @@ -554,7 +553,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w2_qweight = ops.gptq_marlin_moe_repack( layer.w2_qweight, layer.w2_g_idx_sort_indices, - layer.w2_qweight.shape[1] * self.quant_config.pack_factor, + layer.w2_qweight.shape[1], layer.w2_qweight.shape[2], self.quant_config.weight_bits, ) @@ -568,7 +567,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: group_size=self.quant_config.group_size, ) replace_tensor(layer, "w13_scales", marlin_w13_scales) - logger.error(f"{layer.w2_scales.size()}, {layer.intermediate_size_per_partition}, {self.quant_config.group_size}") + # logger.error(f"{layer.w2_scales.size()}, {layer.intermediate_size_per_partition}, {self.quant_config.group_size}") marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, @@ -576,6 +575,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: group_size=self.quant_config.group_size, ) replace_tensor(layer, "w2_scales", marlin_w2_scales) + layer.marlin_state = GPTQMarlinState.READY def apply( self, From 53b23b9525c474b655dd0fe1d0c6787a72bec60e Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 12:42:59 -0700 Subject: [PATCH 052/106] mult qweight shape by pack factor --- vllm/model_executor/layers/quantization/gptq_marlin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ac5e018193a97..7b376a79004b2 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -545,7 +545,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w13_qweight = ops.gptq_marlin_moe_repack( layer.w13_qweight, layer.w13_g_idx_sort_indices, - layer.w13_qweight.shape[1], + layer.w13_qweight.shape[1] * self.quant_config.pack_factor, 
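
For reference while reading the size_k experiments above and below: GPTQ packs pack_factor = 32 // weight_bits values into every int32 word along K, and the Marlin buffer these commits converge on is [size_k // 16, size_n * (num_bits // 2)] int32 words per expert (the *2 / *4 constants earlier are its 4-bit and 8-bit instances). A quick sanity sketch, assuming those conventions (not vLLM code):

def unpacked_size_k(qweight_rows: int, weight_bits: int) -> int:
    pack_factor = 32 // weight_bits      # 8 values per int32 word for 4-bit, 4 for 8-bit
    return qweight_rows * pack_factor    # logical K of the expert weight

def marlin_repack_shape(size_k: int, size_n: int, num_bits: int):
    # mirrors the torch.empty(...) allocation in gptq_marlin_moe_repack
    return (size_k // 16, size_n * (num_bits // 2))

# 4-bit expert with K=4096, N=14336: qweight is [512, 14336] packed,
# and the repacked Marlin tensor is [256, 28672].
assert unpacked_size_k(512, 4) == 4096
assert marlin_repack_shape(4096, 14336, 4) == (256, 28672)
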
layer.w13_qweight.shape[2], self.quant_config.weight_bits, ) @@ -553,7 +553,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: marlin_w2_qweight = ops.gptq_marlin_moe_repack( layer.w2_qweight, layer.w2_g_idx_sort_indices, - layer.w2_qweight.shape[1], + layer.w2_qweight.shape[1] * self.quant_config.pack_factor, layer.w2_qweight.shape[2], self.quant_config.weight_bits, ) From bc407861ecb5011f390507d31ed1b13318a08f67 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 12:59:52 -0700 Subject: [PATCH 053/106] Potential support for 8 bit --- vllm/_custom_ops.py | 8 +------- vllm/model_executor/layers/fused_moe/fused_moe_marlin.py | 9 +++++---- vllm/model_executor/layers/quantization/gptq_marlin.py | 1 + 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 37787aa6d3b2f..92c846cd685f6 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -283,13 +283,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: num_experts = b_q_weight.shape[0] - # output = torch.empty((num_experts, size_k // 16, size_n * 2), - # device=b_q_weight.device, - # dtype=b_q_weight.dtype) - # for e in range(num_experts): - # output[e] = torch.ops._C.gptq_marlin_repack(b_q_weight[e], perm[e], - # size_k, size_n, num_bits) - output = torch.empty((num_experts, size_k, size_n), + output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)), device=b_q_weight.device, dtype=b_q_weight.dtype) for e in range(num_experts): diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index d84126568d726..44ea7299fe447 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -23,6 +23,7 @@ def fused_moe_marlin( use_fp8: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + num_bits: int = 8, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -56,7 +57,7 @@ def fused_moe_marlin( assert hidden_states.shape[ 1] == w1.shape[1] * 16, "Hidden size mismatch w1" assert hidden_states.shape[ - 1] == w2.shape[2] // 2, "Hidden size mismatch w2" + 1] == w2.shape[2] // (num_bits // 2), "Hidden size mismatch w2" assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" @@ -86,7 +87,7 @@ def fused_moe_marlin( sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) - max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 + max_workspace_size = ((M + 255) // 256) * (max((num_bits // 2) * N, K) // 64) * 16 workspace = torch.zeros(max_workspace_size, dtype=torch.int, device="cuda", @@ -109,7 +110,7 @@ def fused_moe_marlin( rand_perm1, workspace, M, - 2 * N, + (num_bits // 2) * N, K, True, E, @@ -119,7 +120,7 @@ def fused_moe_marlin( False, ) - ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) + ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, (num_bits // 2) * N)) intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( intermediate_cache2, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 7b376a79004b2..1f178d828ec94 
100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -601,4 +601,5 @@ def apply( renormalize=renormalize, w1_scale=layer.w13_scales, w2_scale=layer.w2_scales, + num_bits=self.quant_config.weight_bits, ) From bea13de41d2b4f7db75ea67893615c04f83a10f1 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 13:05:40 -0700 Subject: [PATCH 054/106] undo change --- vllm/model_executor/layers/fused_moe/fused_moe_marlin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index 44ea7299fe447..efafcef2f1ee7 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -87,7 +87,7 @@ def fused_moe_marlin( sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) - max_workspace_size = ((M + 255) // 256) * (max((num_bits // 2) * N, K) // 64) * 16 + max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 workspace = torch.zeros(max_workspace_size, dtype=torch.int, device="cuda", @@ -110,7 +110,7 @@ def fused_moe_marlin( rand_perm1, workspace, M, - (num_bits // 2) * N, + 2 * N, K, True, E, From a3a9114b00109833a780e8b6f121a24a622d42b7 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 14:22:50 -0700 Subject: [PATCH 055/106] qzeros --- vllm/model_executor/layers/fused_moe/layer.py | 11 -- vllm/model_executor/models/mixtral.py | 7 +- vllm/model_executor/models/mixtral_quant.py | 144 +++++++++++++----- 3 files changed, 112 insertions(+), 50 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 686d0415c5e5c..53c682385e840 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -399,15 +399,4 @@ def make_expert_params_mapping( shard_id, ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the qzeros for the experts - # (param_name, weight_name, expert_id, shard_id) - ( - "experts.w13_qzeros" - if weight_name in gate_up else "experts.w2_qzeros", - f"experts.{expert_id}.{weight_name}.qzeros", - expert_id, - shard_id, - ) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) ]) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 011b10583c528..38b9f4ee24c0c 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -446,7 +446,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # logger.error(expert_params_mapping) for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: + if weight_name not in name or ".qzeros" in name: continue # logger.error(f"{weight_name} {param_name} {name}") name = name.replace(weight_name, param_name) @@ -466,7 +466,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): break else: # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: + if name.endswith(".bias") and name not in params_dict: + continue + + if ".qzeros" in name: continue # Skip layers on other devices. 
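
The .qzeros skips being added here are deliberate: the Marlin path only consumes qweight, scales and g_idx, and assuming symmetric (sym=True) GPTQ checkpoints the zero points are a constant that carries no information. A minimal filter in the same spirit (hypothetical helper, not the vLLM loader):

def is_skipped(name: str, params_dict: dict) -> bool:
    # drop unused GPTQ zero points and biases that have no matching parameter
    if ".qzeros" in name:
        return True
    return name.endswith(".bias") and name not in params_dict

assert is_skipped("model.layers.0.block_sparse_moe.experts.3.w1.qzeros", {})
assert not is_skipped("model.layers.0.block_sparse_moe.experts.3.w1.qweight", {})
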
if is_pp_missing_parameter(name, self): diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index cdfd24874b974..85dafd55bbcf8 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -21,6 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" +import re from typing import Iterable, List, Optional, Tuple import numpy as np @@ -34,6 +35,7 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, @@ -94,10 +96,13 @@ class MixtralMoE(nn.Module): def __init__( self, config: MixtralConfig, + use_fused_moe: bool, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.config = config + self.use_fused_moe = use_fused_moe + self.quant_config = quant_config self.rank = get_tensor_model_parallel_rank() self.tp_size = get_tensor_model_parallel_world_size() self.num_total_experts = config.num_local_experts @@ -113,14 +118,26 @@ def __init__( raise ValueError( f"Rank {self.rank} has no experts assigned to it.") - self.experts = nn.ModuleList([ - MixtralMLP(self.num_total_experts, - config.hidden_size, - config.intermediate_size, - quant_config=quant_config) - if idx in self.expert_indicies else None - for idx in range(self.num_total_experts) - ]) + if self.use_fused_moe: + params_dtype = torch.float16 + self.experts = FusedMoE(num_experts=self.num_total_experts, + top_k=self.top_k, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=self.tp_size) + else: + self.experts = nn.ModuleList([ + MixtralMLP(self.num_total_experts, + config.hidden_size, + config.intermediate_size, + quant_config=quant_config) + if idx in self.expert_indicies else None + for idx in range(self.num_total_experts) + ]) self.gate = ReplicatedLinear(config.hidden_size, self.num_total_experts, @@ -132,28 +149,34 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view(-1, hidden_dim) router_logits, _ = self.gate(hidden_states) - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = None - for expert_idx in self.expert_indicies: - expert_layer = self.experts[expert_idx] - expert_mask = (selected_experts == expert_idx) - expert_weights = (routing_weights * expert_mask).sum(dim=-1, - keepdim=True) - - current_hidden_states = expert_layer(hidden_states).mul_( - expert_weights) - if final_hidden_states is None: - final_hidden_states = current_hidden_states - else: - final_hidden_states.add_(current_hidden_states) - - return tensor_model_parallel_all_reduce(final_hidden_states).view( - num_tokens, hidden_dim) + if self.use_fused_moe: + ret = self.experts(hidden_states.half(), router_logits) + return ret.bfloat16() + else: + routing_weights = F.softmax(router_logits, + dim=1, + dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + 
dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in self.expert_indicies: + expert_layer = self.experts[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum( + dim=-1, keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + num_tokens, hidden_dim) class MixtralAttention(nn.Module): @@ -238,6 +261,7 @@ class MixtralDecoderLayer(nn.Module): def __init__( self, config: MixtralConfig, + use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -254,6 +278,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config) self.block_sparse_moe = MixtralMoE(config=config, + use_fused_moe=use_fused_moe, quant_config=quant_config) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -294,6 +319,7 @@ class MixtralModel(nn.Module): def __init__( self, config: MixtralConfig, + use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -307,6 +333,7 @@ def __init__( ) self.layers = nn.ModuleList([ MixtralDecoderLayer(config, + use_fused_moe, cache_config, quant_config=quant_config) for _ in range(config.num_hidden_layers) @@ -343,10 +370,12 @@ def __init__( super().__init__() # TODO check runs with dtype=float16 + self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) self.config = config self.quant_config = quant_config - self.model = MixtralModel(config, cache_config, quant_config) + self.model = MixtralModel(config, self.use_fused_moe, cache_config, + quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -407,9 +436,50 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if name.endswith(".bias") and name not in params_dict: continue - if ("block_sparse_moe.experts." in name - and name not in params_dict): - continue - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + if self.use_fused_moe: + if ("block_sparse_moe.experts." in name + and ".w1." not in name and ".w2." not in name + and ".w3." not in name + and name not in params_dict): + continue + + if (".qzeros" in name): + continue + + shard_id = None + expert_id = 0 + + has_any_numbered = (".qweight" in name or ".scales" in name + or ".g_idx" in name) + if (has_any_numbered and (".w1." in name)): + name = name.replace(".w1.", ".w13_") + shard_id = 0 + if (has_any_numbered and (".w2." in name)): + name = name.replace(".w2.", ".w2_") + shard_id = 0 + if (has_any_numbered and (".w3." in name)): + name = name.replace(".w3.", ".w13_") + shard_id = 1 + + exp_string = re.search(r"\.experts\.\d+.", name) + if exp_string: + exp_string = exp_string.group(0) + expert_id = int(exp_string.split(".")[2]) + name = name.replace(exp_string, ".experts.") + + else: + if ("block_sparse_moe.experts." 
in name + and name not in params_dict): + continue + + param = params_dict[name] + + if self.use_fused_moe and shard_id is not None: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, name, shard_id, + expert_id, True) + else: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From eb916f9584f7056e01872e8219fa3399c356640e Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 14:25:12 -0700 Subject: [PATCH 056/106] switching traffic to mixtral quant --- vllm/model_executor/layers/quantization/gptq_marlin.py | 4 ++-- vllm/model_executor/models/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 1f178d828ec94..90762efef8108 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -539,8 +539,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: device=device), requires_grad=False, ) - logger.error(f"W13 qweight size - {layer.w13_qweight.size()}") - logger.error(f"Quant Config: {self.quant_config}") + # logger.error(f"W13 qweight size - {layer.w13_qweight.size()}") + # logger.error(f"Quant Config: {self.quant_config}") # Repack weights marlin_w13_qweight = ops.gptq_marlin_moe_repack( layer.w13_qweight, diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 329df4830af41..94c3cea98be7b 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -48,7 +48,7 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), - "QuantMixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), + "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), From 017d6f80f1d3078ca1e705c948d2804ac9db0e94 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 14:28:16 -0700 Subject: [PATCH 057/106] compat --- vllm/model_executor/layers/fused_moe/layer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 53c682385e840..0b06ee86a308d 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -240,11 +240,11 @@ def weight_loader( shard_size = loaded_weight.size()[-1] if shard_id == 0: param_data[expert_id, :, :shard_size] = loaded_weight - elif shard_id == 2: + elif shard_id == 2 or shard_id == 1: param_data[expert_id, :, shard_size:] = loaded_weight else: raise ValueError(f"Invalid shard_id: {shard_id}: " - "must be 0 or 2.") + "must be 0, 1, or 2.") elif "w2" in weight_name: param_data[expert_id][:] = loaded_weight else: From eb9c0870afc85ad1db73f52d215164cf9a388cb0 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 14:42:31 -0700 Subject: [PATCH 058/106] Passing intermediate tensor into mixtral in quant file --- vllm/model_executor/models/mixtral.py | 2 +- vllm/model_executor/models/mixtral_quant.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git 
a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 38b9f4ee24c0c..a70b31f82c070 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -341,7 +341,7 @@ def __init__( self.config = config self.lora_config = lora_config - + self.quant_config = quant_config self.model = MixtralModel(config, cache_config, quant_config, diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 85dafd55bbcf8..9c8e2b395992f 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -391,8 +391,22 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, intermediate_tensors) return hidden_states + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: From ea3cf18c457305bc4b257b3dadf8047eea607f90 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 14:45:06 -0700 Subject: [PATCH 059/106] Removing intemediate tensors from forward --- vllm/model_executor/models/mixtral.py | 23 ++++++--------------- vllm/model_executor/models/mixtral_quant.py | 2 +- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a70b31f82c070..0c729d96d5707 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -283,25 +283,14 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], ) -> torch.Tensor: - if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): layer = self.layers[i] hidden_states, residual = layer(positions, hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, residual) - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) + kv_caches[i], attn_metadata, + residual) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states @@ -373,7 +362,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 9c8e2b395992f..a379c3e0f1bb8 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -391,7 +391,7 @@ def forward( 
intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata) return hidden_states def make_empty_intermediate_tensors( From 4f6b4caaf07c3b5af3e8cbc8a359fde77aee02b1 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 14:50:24 -0700 Subject: [PATCH 060/106] load weights from quant --- vllm/model_executor/models/mixtral.py | 115 +++++++++++--------- vllm/model_executor/models/mixtral_quant.py | 14 --- 2 files changed, 61 insertions(+), 68 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0c729d96d5707..fbe8a3530116f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -283,14 +283,25 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): + if get_pp_group().is_first_rank: + hidden_states = self.embed_tokens(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer(positions, hidden_states, - kv_caches[i], attn_metadata, - residual) + kv_caches[i - self.start_layer], + attn_metadata, residual) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states @@ -362,7 +373,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, intermediate_tensors) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, @@ -401,19 +412,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts) - params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue @@ -421,54 +423,59 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - shard_id) + weight_loader(param, loaded_weight, shard_id) break else: - # logger.error(expert_params_mapping) - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name or ".qzeros" in name: + # Skip loading extra bias for GPTQ models. 
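
The fused branch that follows folds each per-expert GPTQ tensor into the stacked w13_* / w2_* parameters: w1 and w3 land in w13 as shards 0 and 1, w2 keeps its own parameter, and the expert index is pulled out of the checkpoint name. A self-contained sketch of that renaming (hypothetical helper mirroring the replace/regex logic in the diff):

import re

def remap_expert_name(name: str):
    m = re.search(r"\.experts\.(\d+)\.(w[123])\.", name)
    if m is None:
        return name, None, None                       # not a per-expert tensor
    expert_id, proj = int(m.group(1)), m.group(2)
    fused = "w2_" if proj == "w2" else "w13_"
    shard_id = 1 if proj == "w3" else 0
    name = re.sub(r"\.experts\.\d+\.w[123]\.", f".experts.{fused}", name)
    return name, expert_id, shard_id

print(remap_expert_name("model.layers.0.block_sparse_moe.experts.3.w1.qweight"))
# ('model.layers.0.block_sparse_moe.experts.w13_qweight', 3, 0)
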
+ if name.endswith(".bias") and name not in params_dict: + continue + + if self.use_fused_moe: + if ("block_sparse_moe.experts." in name + and ".w1." not in name and ".w2." not in name + and ".w3." not in name + and name not in params_dict): continue - # logger.error(f"{weight_name} {param_name} {name}") - name = name.replace(weight_name, param_name) - # logger.error(f"Loading {name} from {weight_name}") - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + + if (".qzeros" in name): continue - # logger.error(params_dict.keys()) - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - param_name, - shard_id=shard_id, - expert_id=expert_id, - is_quantized=True) - break + + shard_id = None + expert_id = 0 + + has_any_numbered = (".qweight" in name or ".scales" in name + or ".g_idx" in name) + if (has_any_numbered and (".w1." in name)): + name = name.replace(".w1.", ".w13_") + shard_id = 0 + if (has_any_numbered and (".w2." in name)): + name = name.replace(".w2.", ".w2_") + shard_id = 0 + if (has_any_numbered and (".w3." in name)): + name = name.replace(".w3.", ".w13_") + shard_id = 1 + + exp_string = re.search(r"\.experts\.\d+.", name) + if exp_string: + exp_string = exp_string.group(0) + expert_id = int(exp_string.split(".")[2]) + name = name.replace(exp_string, ".experts.") + else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if ".qzeros" in name: - continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: + if ("block_sparse_moe.experts." in name + and name not in params_dict): continue - param = params_dict[name] + param = params_dict[name] + + if self.use_fused_moe and shard_id is not None: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, name, shard_id, + expert_id, True) + else: weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index a379c3e0f1bb8..85dafd55bbcf8 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -393,20 +393,6 @@ def forward( hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata) return hidden_states - - def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: From 7ec27d9722c404b1cd3c7872ba7e92c9a722ff96 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 14:53:09 -0700 Subject: [PATCH 061/106] Mixtral load weights change: --- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/mixtral.py | 79 +++++++++++--------------- 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 94c3cea98be7b..329df4830af41 100644 --- 
a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -48,7 +48,7 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), - "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), + "QuantMixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index fbe8a3530116f..1b59b1f7c2852 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -22,7 +22,7 @@ # limitations under the License. """Inference-only Mixtral model.""" from typing import Iterable, List, Optional, Tuple - +import re import torch from torch import nn from transformers import MixtralConfig @@ -432,50 +432,39 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if name.endswith(".bias") and name not in params_dict: continue - if self.use_fused_moe: - if ("block_sparse_moe.experts." in name - and ".w1." not in name and ".w2." not in name - and ".w3." not in name - and name not in params_dict): - continue - - if (".qzeros" in name): - continue - - shard_id = None - expert_id = 0 - - has_any_numbered = (".qweight" in name or ".scales" in name - or ".g_idx" in name) - if (has_any_numbered and (".w1." in name)): - name = name.replace(".w1.", ".w13_") - shard_id = 0 - if (has_any_numbered and (".w2." in name)): - name = name.replace(".w2.", ".w2_") - shard_id = 0 - if (has_any_numbered and (".w3." in name)): - name = name.replace(".w3.", ".w13_") - shard_id = 1 - - exp_string = re.search(r"\.experts\.\d+.", name) - if exp_string: - exp_string = exp_string.group(0) - expert_id = int(exp_string.split(".")[2]) - name = name.replace(exp_string, ".experts.") - - else: - if ("block_sparse_moe.experts." in name - and name not in params_dict): - continue + if ("block_sparse_moe.experts." in name + and ".w1." not in name and ".w2." not in name + and ".w3." not in name + and name not in params_dict): + continue + + if (".qzeros" in name): + continue + + shard_id = None + expert_id = 0 + + has_any_numbered = (".qweight" in name or ".scales" in name + or ".g_idx" in name) + if (has_any_numbered and (".w1." in name)): + name = name.replace(".w1.", ".w13_") + shard_id = 0 + if (has_any_numbered and (".w2." in name)): + name = name.replace(".w2.", ".w2_") + shard_id = 0 + if (has_any_numbered and (".w3." 
in name)): + name = name.replace(".w3.", ".w13_") + shard_id = 1 + + exp_string = re.search(r"\.experts\.\d+.", name) + if exp_string: + exp_string = exp_string.group(0) + expert_id = int(exp_string.split(".")[2]) + name = name.replace(exp_string, ".experts.") param = params_dict[name] - if self.use_fused_moe and shard_id is not None: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight, name, shard_id, - expert_id, True) - else: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, name, shard_id, + expert_id, True) From aa1fe77b71954434747ac8a27dcbb03446fd4a8a Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Tue, 13 Aug 2024 14:54:44 -0700 Subject: [PATCH 062/106] none shard id change --- vllm/model_executor/models/mixtral.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1b59b1f7c2852..18d377922ef03 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -464,7 +464,12 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight, name, shard_id, - expert_id, True) + if shard_id is not None: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, name, shard_id, + expert_id, True) + else: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From ae8fb1542b9cac9c32a610a28f606f63da38ab4a Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 10:37:12 -0700 Subject: [PATCH 063/106] Use class from mixtral_quant --- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/mixtral.py | 403 +++++++++++++++++-------- 2 files changed, 279 insertions(+), 126 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 329df4830af41..72a13d13eb0d6 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -48,7 +48,7 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), - "QuantMixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), + "QuantMixtralForCausalLM": ("mixtral", "QuantizedMixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 18d377922ef03..05bbabe3e9278 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -32,26 +32,31 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) from 
vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) + DEFAULT_VOCAB_PADDING_SIZE, + ParallelLMHead, + VocabParallelEmbedding, +) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, maybe_remap_kv_scale_name) + default_weight_loader, + maybe_remap_kv_scale_name, +) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers -import logging -logger = logging.getLogger(__name__) + class MixtralMoE(nn.Module): """A tensor-parallel MoE implementation for Mixtral that shards each expert across all ranks. @@ -61,36 +66,42 @@ class MixtralMoE(nn.Module): across ranks. """ - def __init__(self, - num_experts: int, - top_k: int, - hidden_size: int, - intermediate_size: int, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - tp_size: Optional[int] = None, - prefix: str = ""): + def __init__( + self, + num_experts: int, + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + tp_size: Optional[int] = None, + prefix: str = "", + ): super().__init__() self.hidden_size = hidden_size # Gate always runs at half / full precision for now. - self.gate = ReplicatedLinear(hidden_size, - num_experts, - bias=False, - params_dtype=params_dtype, - quant_config=None, - prefix=f"{prefix}.gate") - - self.experts = FusedMoE(num_experts=num_experts, - top_k=top_k, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - params_dtype=params_dtype, - reduce_results=True, - renormalize=True, - quant_config=quant_config, - tp_size=tp_size, - prefix=f"{prefix}.experts") + self.gate = ReplicatedLinear( + hidden_size, + num_experts, + bias=False, + params_dtype=params_dtype, + quant_config=None, + prefix=f"{prefix}.gate", + ) + + self.experts = FusedMoE( + num_experts=num_experts, + top_k=top_k, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=tp_size, + prefix=f"{prefix}.experts", + ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. 
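
It may help to spell out what the FusedMoE block above is configured to do with the router logits. A plain-PyTorch sketch of top_k routing with renormalize=True, matching the unfused reference path kept in mixtral_quant.py (the fused path computes the same selection before dispatching to the expert GEMMs):

import torch

def route(router_logits: torch.Tensor, top_k: int):
    weights = torch.softmax(router_logits, dim=-1, dtype=torch.float)
    topk_weights, topk_ids = torch.topk(weights, top_k, dim=-1)
    topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)  # renormalize
    return topk_weights, topk_ids

logits = torch.randn(4, 8)                 # 4 tokens, 8 experts
w, ids = route(logits, top_k=2)
assert torch.allclose(w.sum(dim=-1), torch.ones(4))
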
@@ -103,7 +114,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class MixtralAttention(nn.Module): - def __init__( self, hidden_size: int, @@ -160,12 +170,14 @@ def __init__( base=int(self.rope_theta), is_neox_style=True, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + ) def forward( self, @@ -183,7 +195,6 @@ def forward( class MixtralDecoderLayer(nn.Module): - def __init__( self, config: MixtralConfig, @@ -203,18 +214,20 @@ def __init__( rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.self_attn") + prefix=f"{prefix}.self_attn", + ) self.block_sparse_moe = MixtralMoE( num_experts=config.num_local_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, quant_config=quant_config, - prefix=f"{prefix}.block_sparse_moe") - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) + prefix=f"{prefix}.block_sparse_moe", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) def forward( self, @@ -229,8 +242,7 @@ def forward( residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) + hidden_states, residual = self.input_layernorm(hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, @@ -239,14 +251,12 @@ def forward( ) # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.block_sparse_moe(hidden_states) return hidden_states, residual class MixtralModel(nn.Module): - def __init__( self, config: MixtralConfig, @@ -257,8 +267,11 @@ def __init__( ) -> None: super().__init__() self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 + lora_vocab = ( + (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) + if lora_config + else 0 + ) self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size @@ -273,7 +286,8 @@ def __init__( lambda prefix: MixtralDecoderLayer( config, cache_config, quant_config=quant_config, prefix=prefix ), - prefix=f"{prefix}.layers") + prefix=f"{prefix}.layers", + ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -294,14 +308,17 @@ def forward( residual = intermediate_tensors["residual"] for i in range(self.start_layer, self.end_layer): layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, residual) + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + residual, + ) if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) + return IntermediateTensors( + {"hidden_states": 
hidden_states, "residual": residual} + ) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states @@ -341,12 +358,10 @@ def __init__( self.config = config self.lora_config = lora_config - self.quant_config = quant_config - self.model = MixtralModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + + self.model = MixtralModel( + config, cache_config, quant_config, lora_config=lora_config, prefix="model" + ) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -360,8 +375,9 @@ def __init__( if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, ) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size) + self.logits_processor = LogitsProcessor( + self.unpadded_vocab_size, config.vocab_size + ) self.sampler = Sampler() def forward( @@ -372,29 +388,30 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + hidden_states = self.model( + input_ids, positions, kv_caches, attn_metadata, intermediate_tensors + ) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits( + self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata + ) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) + self, batch_size: int, dtype: torch.dtype, device: torch.device + ) -> IntermediateTensors: + return IntermediateTensors( + { + "hidden_states": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + "residual": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + } + ) def sample( self, @@ -412,11 +429,137 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + ) + params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. 
+ if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id, is_quantized=True) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + weight_name, + shard_id=shard_id, + expert_id=expert_id, + is_quantized=True, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + + +class QuantizedMixtralForCausalLM(nn.Module): + fall_back_to_pt_during_load = False + + def __init__( + self, + config: MixtralConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + # TODO check runs with dtype=float16 + self.use_fused_moe = config.torch_dtype != torch.float8_e4m3fn + + self.config = config + self.quant_config = quant_config + self.model = MixtralModel( + config, self.use_fused_moe, cache_config, quant_config + ) + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=quant_config + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata) + return hidden_states + + def compute_logits( + self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata + ) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) + return logits + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) @@ -432,44 +575,54 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if name.endswith(".bias") and name not in params_dict: continue - if ("block_sparse_moe.experts." in name - and ".w1." not in name and ".w2." not in name + if self.use_fused_moe: + if ( + "block_sparse_moe.experts." in name + and ".w1." not in name + and ".w2." not in name and ".w3." 
not in name - and name not in params_dict): - continue + and name not in params_dict + ): + continue + + if ".qzeros" in name: + continue + + shard_id = None + expert_id = 0 + + has_any_numbered = ( + ".qweight" in name or ".scales" in name or ".g_idx" in name + ) + if has_any_numbered and (".w1." in name): + name = name.replace(".w1.", ".w13_") + shard_id = 0 + if has_any_numbered and (".w2." in name): + name = name.replace(".w2.", ".w2_") + shard_id = 0 + if has_any_numbered and (".w3." in name): + name = name.replace(".w3.", ".w13_") + shard_id = 1 + + exp_string = re.search(r"\.experts\.\d+.", name) + if exp_string: + exp_string = exp_string.group(0) + expert_id = int(exp_string.split(".")[2]) + name = name.replace(exp_string, ".experts.") - if (".qzeros" in name): - continue - - shard_id = None - expert_id = 0 - - has_any_numbered = (".qweight" in name or ".scales" in name - or ".g_idx" in name) - if (has_any_numbered and (".w1." in name)): - name = name.replace(".w1.", ".w13_") - shard_id = 0 - if (has_any_numbered and (".w2." in name)): - name = name.replace(".w2.", ".w2_") - shard_id = 0 - if (has_any_numbered and (".w3." in name)): - name = name.replace(".w3.", ".w13_") - shard_id = 1 - - exp_string = re.search(r"\.experts\.\d+.", name) - if exp_string: - exp_string = exp_string.group(0) - expert_id = int(exp_string.split(".")[2]) - name = name.replace(exp_string, ".experts.") + else: + if "block_sparse_moe.experts." in name and name not in params_dict: + continue param = params_dict[name] - if shard_id is not None: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight, name, shard_id, - expert_id, True) + if self.use_fused_moe and shard_id is not None: + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight, name, shard_id, expert_id, True) else: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) weight_loader(param, loaded_weight) From b863981837d61a6d465e28c0c46e2304cb47333b Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 10:39:27 -0700 Subject: [PATCH 064/106] Removing lora from mixtral model init --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 05bbabe3e9278..652ad21d12ab4 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -514,7 +514,7 @@ def __init__( self.config = config self.quant_config = quant_config self.model = MixtralModel( - config, self.use_fused_moe, cache_config, quant_config + config, cache_config, quant_config, None, prefix="model" ) self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, quant_config=quant_config From 5556d284abbaa7faacd627ea32b433ad5b702f5b Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 10:41:14 -0700 Subject: [PATCH 065/106] Adding empty intermediate tensors --- vllm/model_executor/models/mixtral.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 652ad21d12ab4..fa74ec94b9644 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -530,7 +530,7 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: 
Optional[IntermediateTensors] = None, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata) + hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors) return hidden_states def compute_logits( @@ -539,6 +539,20 @@ def compute_logits( logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, device: torch.device + ) -> IntermediateTensors: + return IntermediateTensors( + { + "hidden_states": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + "residual": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + } + ) + def sample( self, logits: Optional[torch.Tensor], From c484a3766a79bcbd0f7ed2e5e2f63efde620f0c0 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 11:13:10 -0700 Subject: [PATCH 066/106] Building quantMixtralModel --- vllm/model_executor/models/mixtral.py | 76 +++++++++++++++++++++------ 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index fa74ec94b9644..43fadf22396b5 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -323,6 +323,50 @@ def forward( return hidden_states +class QuantMixtralModel(nn.Module): + def __init__( + self, + config: MixtralConfig, + use_fused_moe: bool, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList( + [ + MixtralDecoderLayer( + config, use_fused_moe, cache_config, quant_config=quant_config + ) + for _ in range(config.num_hidden_layers) + ] + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, hidden_states, kv_caches[i], attn_metadata, residual + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + class MixtralForCausalLM(nn.Module, SupportsLoRA): fall_back_to_pt_during_load = False @@ -513,8 +557,8 @@ def __init__( self.config = config self.quant_config = quant_config - self.model = MixtralModel( - config, cache_config, quant_config, None, prefix="model" + self.model = QuantMixtralModel( + config, self.use_fused_moe, cache_config, quant_config ) self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, quant_config=quant_config @@ -530,7 +574,7 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors) + hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata) return hidden_states def compute_logits( @@ -539,19 +583,19 @@ def compute_logits( logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits - def 
make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, device: torch.device - ) -> IntermediateTensors: - return IntermediateTensors( - { - "hidden_states": torch.zeros( - (batch_size, self.config.hidden_size), dtype=dtype, device=device - ), - "residual": torch.zeros( - (batch_size, self.config.hidden_size), dtype=dtype, device=device - ), - } - ) + # def make_empty_intermediate_tensors( + # self, batch_size: int, dtype: torch.dtype, device: torch.device + # ) -> IntermediateTensors: + # return IntermediateTensors( + # { + # "hidden_states": torch.zeros( + # (batch_size, self.config.hidden_size), dtype=dtype, device=device + # ), + # "residual": torch.zeros( + # (batch_size, self.config.hidden_size), dtype=dtype, device=device + # ), + # } + # ) def sample( self, From 0344e72750fea4e1916bfbeed57c0db11f51fff8 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 11:22:15 -0700 Subject: [PATCH 067/106] fused moe test --- vllm/model_executor/models/mixtral_quant.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 85dafd55bbcf8..bf3f15d072eff 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -51,7 +51,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput - +import logging +logger = logging.getLogger(__name__) class MixtralMLP(nn.Module): def __init__( @@ -371,7 +372,7 @@ def __init__( # TODO check runs with dtype=float16 self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) - + logger.error(f"Using fused MoE: {self.use_fused_moe}") self.config = config self.quant_config = quant_config self.model = MixtralModel(config, self.use_fused_moe, cache_config, From 8c8b3fa774e8eed90f23d896afabfe9fb0c81ab7 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 11:26:26 -0700 Subject: [PATCH 068/106] Lora enabled mixtral --- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/mixtral_quant.py | 177 +++++++++++++++++++- 2 files changed, 176 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 72a13d13eb0d6..2954d8874c9a1 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -48,7 +48,7 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), - "QuantMixtralForCausalLM": ("mixtral", "QuantizedMixtralForCausalLM"), + "QuantMixtralForCausalLM": ("mixtral_quant", "LoRAEnabledMixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index bf3f15d072eff..1801d1b5e0290 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -30,8 +30,9 @@ from torch import nn from transformers import MixtralConfig +from .interfaces import SupportsLoRA from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, 
tensor_model_parallel_all_reduce) @@ -46,7 +47,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) + ParallelLMHead, VocabParallelEmbedding, DEFAULT_VOCAB_PADDING_SIZE) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput @@ -484,3 +485,175 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) +class LoRAEnabledMixtralForCausalLM(nn.Module, SupportsLoRA): + fall_back_to_pt_during_load = False + + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__( + self, + config: MixtralConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + + self.config = config + self.lora_config = lora_config + + self.model = MixtralModel( + config, cache_config, quant_config, lora_config=lora_config, prefix="model" + ) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + self.logits_processor = LogitsProcessor( + self.unpadded_vocab_size, config.vocab_size + ) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> torch.Tensor: + hidden_states = self.model( + input_ids, positions, kv_caches, attn_metadata, intermediate_tensors + ) + return hidden_states + + def compute_logits( + self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata + ) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) + return logits + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, device: torch.device + ) -> IntermediateTensors: + return IntermediateTensors( + { + "hidden_states": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + "residual": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + } + ) + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), 
+ ("qkv_proj", "v_proj", "v"), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if self.use_fused_moe: + if ("block_sparse_moe.experts." in name + and ".w1." not in name and ".w2." not in name + and ".w3." not in name + and name not in params_dict): + continue + + if (".qzeros" in name): + continue + + shard_id = None + expert_id = 0 + + has_any_numbered = (".qweight" in name or ".scales" in name + or ".g_idx" in name) + if (has_any_numbered and (".w1." in name)): + name = name.replace(".w1.", ".w13_") + shard_id = 0 + if (has_any_numbered and (".w2." in name)): + name = name.replace(".w2.", ".w2_") + shard_id = 0 + if (has_any_numbered and (".w3." in name)): + name = name.replace(".w3.", ".w13_") + shard_id = 1 + + exp_string = re.search(r"\.experts\.\d+.", name) + if exp_string: + exp_string = exp_string.group(0) + expert_id = int(exp_string.split(".")[2]) + name = name.replace(exp_string, ".experts.") + + else: + if ("block_sparse_moe.experts." in name + and name not in params_dict): + continue + + param = params_dict[name] + + if self.use_fused_moe and shard_id is not None: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, name, shard_id, + expert_id, True) + else: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) \ No newline at end of file From dff59cdbcec1dc975dd1c0809ee4e1929c1e5cf5 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 11:28:38 -0700 Subject: [PATCH 069/106] LoRAMixtralModel compat --- vllm/model_executor/models/mixtral_quant.py | 74 ++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 1801d1b5e0290..ae5559d4297d9 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -31,9 +31,11 @@ from transformers import MixtralConfig from .interfaces import SupportsLoRA +from .utils import is_pp_missing_parameter, make_layers + from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import (get_tensor_model_parallel_rank, +from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.fused_moe import FusedMoE @@ -360,6 +362,74 @@ def forward( return hidden_states + +class LoRAMixtralModel(nn.Module): + def __init__( + self, + config: MixtralConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + lora_vocab = ( + (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) + if 
lora_config + else 0 + ) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MixtralDecoderLayer( + config, cache_config, quant_config=quant_config, prefix=prefix + ), + prefix=f"{prefix}.layers", + ) + + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + hidden_states = self.embed_tokens(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + residual, + ) + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + class MixtralForCausalLM(nn.Module): fall_back_to_pt_during_load = False @@ -521,7 +591,7 @@ def __init__( self.config = config self.lora_config = lora_config - self.model = MixtralModel( + self.model = LoRAMixtralModel( config, cache_config, quant_config, lora_config=lora_config, prefix="model" ) self.unpadded_vocab_size = config.vocab_size From 33f7e515a50041ab943f66ee88687c8c2de7d673 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 11:30:17 -0700 Subject: [PATCH 070/106] remove prefix --- vllm/model_executor/models/mixtral_quant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index ae5559d4297d9..08049770d5e6b 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -391,7 +391,7 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix + config, cache_config, quant_config=quant_config ), prefix=f"{prefix}.layers", ) From fdba91766bae9f6d234709f6433aeecfbc07f737 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 11:31:11 -0700 Subject: [PATCH 071/106] use fused moe --- vllm/model_executor/models/mixtral_quant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 08049770d5e6b..056ba105531d9 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -590,7 +590,7 @@ def __init__( self.config = config self.lora_config = lora_config - + self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) self.model = LoRAMixtralModel( config, cache_config, quant_config, lora_config=lora_config, prefix="model" ) From 780471ebf1263109923fffe134f7c22bf3bc53f7 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 12:25:15 -0700 Subject: [PATCH 072/106] 
remove org num embeddings --- vllm/model_executor/models/mixtral_quant.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 056ba105531d9..3374530086499 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -600,7 +600,6 @@ def __init__( self.lm_head = ParallelLMHead( self.unpadded_vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE # We need bigger padding if using lora for kernel # compatibility @@ -608,7 +607,7 @@ def __init__( quant_config=quant_config, ) self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size + self.unpadded_vocab_size ) self.sampler = Sampler() From c0970f1d30a11eef359f8c7210492e06381215cf Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 12:29:43 -0700 Subject: [PATCH 073/106] pass use fused moe into decoder --- vllm/model_executor/models/mixtral_quant.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 3374530086499..c0837802ce318 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -391,7 +391,7 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( - config, cache_config, quant_config=quant_config + config, use_fused_moe=True, cache_config=cache_config, quant_config=quant_config ), prefix=f"{prefix}.layers", ) @@ -592,7 +592,7 @@ def __init__( self.lora_config = lora_config self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) self.model = LoRAMixtralModel( - config, cache_config, quant_config, lora_config=lora_config, prefix="model" + config=config, cache_config=cache_config, quant_config=quant_config, lora_config=lora_config, prefix="model" ) self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -600,6 +600,7 @@ def __init__( self.lm_head = ParallelLMHead( self.unpadded_vocab_size, config.hidden_size, + org_num_embeddings=config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE # We need bigger padding if using lora for kernel # compatibility @@ -607,7 +608,7 @@ def __init__( quant_config=quant_config, ) self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size + self.unpadded_vocab_size, config.vocab_size ) self.sampler = Sampler() From 6a1a8387346868fe1be6df24cb61648ac49ff6ec Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:15:06 -0700 Subject: [PATCH 074/106] Mixtral for causal lm load func --- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/mixtral.py | 169 ++++++++++++++++++------- 2 files changed, 123 insertions(+), 48 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 2954d8874c9a1..329df4830af41 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -48,7 +48,7 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), - "QuantMixtralForCausalLM": ("mixtral_quant", "LoRAEnabledMixtralForCausalLM"), + "QuantMixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", 
"MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 43fadf22396b5..e5f12d2eb8648 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -402,7 +402,7 @@ def __init__( self.config = config self.lora_config = lora_config - + self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) self.model = MixtralModel( config, cache_config, quant_config, lora_config=lora_config, prefix="model" ) @@ -465,6 +465,80 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + # def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # stacked_params_mapping = [ + # # (param_name, shard_name, shard_id) + # ("qkv_proj", "q_proj", "q"), + # ("qkv_proj", "k_proj", "k"), + # ("qkv_proj", "v_proj", "v"), + # ] + + # # Params for weights, fp8 weight scales, fp8 activation scales + # # (param_name, weight_name, expert_id, shard_id) + # expert_params_mapping = FusedMoE.make_expert_params_mapping( + # ckpt_gate_proj_name="w1", + # ckpt_down_proj_name="w2", + # ckpt_up_proj_name="w3", + # num_experts=self.config.num_local_experts, + # ) + + # params_dict = dict(self.named_parameters()) + # for name, loaded_weight in weights: + # if "rotary_emb.inv_freq" in name: + # continue + + # for param_name, weight_name, shard_id in stacked_params_mapping: + # if weight_name not in name: + # continue + # name = name.replace(weight_name, param_name) + # # Skip loading extra bias for GPTQ models. + # if name.endswith(".bias") and name not in params_dict: + # continue + # # Skip layers on other devices. + # if is_pp_missing_parameter(name, self): + # continue + + # param = params_dict[name] + # weight_loader = param.weight_loader + # weight_loader(param, loaded_weight, shard_id, is_quantized=True) + # break + # else: + # for mapping in expert_params_mapping: + # param_name, weight_name, expert_id, shard_id = mapping + # if weight_name not in name: + # continue + # name = name.replace(weight_name, param_name) + # # Skip layers on other devices. + # if is_pp_missing_parameter(name, self): + # continue + # param = params_dict[name] + # weight_loader = param.weight_loader + # weight_loader( + # param, + # loaded_weight, + # weight_name, + # shard_id=shard_id, + # expert_id=expert_id, + # is_quantized=True, + # ) + # break + # else: + # # Skip loading extra bias for GPTQ models. + # if name.endswith(".bias") and name not in params_dict: + # continue + # # Skip layers on other devices. + # if is_pp_missing_parameter(name, self): + # continue + # # Remapping the name of FP8 kv-scale. 
+ # name = maybe_remap_kv_scale_name(name, params_dict) + # if name is None: + # continue + + # param = params_dict[name] + # weight_loader = getattr( + # param, "weight_loader", default_weight_loader + # ) + # weight_loader(param, loaded_weight) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -473,71 +547,72 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts, - ) - params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - - for param_name, weight_name, shard_id in stacked_params_mapping: + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id, is_quantized=True) + weight_loader(param, loaded_weight, shard_id) break else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if self.use_fused_moe: + if ("block_sparse_moe.experts." in name + and ".w1." not in name and ".w2." not in name + and ".w3." not in name + and name not in params_dict): continue - name = name.replace(weight_name, param_name) - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + + if (".qzeros" in name): continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( - param, - loaded_weight, - weight_name, - shard_id=shard_id, - expert_id=expert_id, - is_quantized=True, - ) - break + + shard_id = None + expert_id = 0 + + has_any_numbered = (".qweight" in name or ".scales" in name + or ".g_idx" in name) + if (has_any_numbered and (".w1." in name)): + name = name.replace(".w1.", ".w13_") + shard_id = 0 + if (has_any_numbered and (".w2." in name)): + name = name.replace(".w2.", ".w2_") + shard_id = 0 + if (has_any_numbered and (".w3." in name)): + name = name.replace(".w3.", ".w13_") + shard_id = 1 + + exp_string = re.search(r"\.experts\.\d+.", name) + if exp_string: + exp_string = exp_string.group(0) + expert_id = int(exp_string.split(".")[2]) + name = name.replace(exp_string, ".experts.") + else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: + if ("block_sparse_moe.experts." 
in name + and name not in params_dict): continue - param = params_dict[name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) + param = params_dict[name] + + if self.use_fused_moe and shard_id is not None: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, name, shard_id, + expert_id, True) + else: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) From 5c3e857163c53e0c1985bec070233b8420a9a407 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:27:51 -0700 Subject: [PATCH 075/106] Copying over quant mixtral --- vllm/model_executor/models/mixtral.py | 383 +++++++++++++++++--------- 1 file changed, 256 insertions(+), 127 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e5f12d2eb8648..8f577a0beb420 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -24,12 +24,14 @@ from typing import Iterable, List, Optional, Tuple import re import torch +import numpy as np +import torch.nn.functional as F from torch import nn from transformers import MixtralConfig from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank, tensor_model_parallel_all_reduce from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( @@ -112,6 +114,130 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = self.experts(hidden_states, router_logits) return final_hidden_states.view(orig_shape) +class MixtralMLP(nn.Module): + + def __init__( + self, + num_experts: int, + hidden_size: int, + intermediate_size: int, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.num_experts = num_experts + self.ffn_dim = intermediate_size + self.hidden_dim = hidden_size + + self.w1 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + quant_config=quant_config) + self.w2 = ReplicatedLinear(self.ffn_dim, + self.hidden_dim, + bias=False, + quant_config=quant_config) + self.w3 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + quant_config=quant_config) + + # TODO: Use vllm's SiluAndMul + self.act_fn = nn.SiLU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + w1_out, _ = self.w1(hidden_states) + w1_out = self.act_fn(w1_out) + w3_out, _ = self.w3(hidden_states) + current_hidden_states = w1_out * w3_out + current_hidden_states, _ = self.w2(current_hidden_states) + return current_hidden_states +class QuantMixtralMoE(nn.Module): + + def __init__( + self, + config: MixtralConfig, + use_fused_moe: bool, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.use_fused_moe = use_fused_moe + self.quant_config = quant_config + self.rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.num_total_experts = config.num_local_experts + self.top_k = config.num_experts_per_tok + if self.tp_size > self.num_total_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts 
{self.num_total_experts}.") + # Split experts equally between ranks + self.expert_indicies = np.array_split(range( + self.num_total_experts), self.tp_size)[self.rank].tolist() + if not self.expert_indicies: + raise ValueError( + f"Rank {self.rank} has no experts assigned to it.") + + if self.use_fused_moe: + params_dtype = torch.float16 + self.experts = FusedMoE(num_experts=self.num_total_experts, + top_k=self.top_k, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=self.tp_size) + else: + self.experts = nn.ModuleList([ + MixtralMLP(self.num_total_experts, + config.hidden_size, + config.intermediate_size, + quant_config=quant_config) + if idx in self.expert_indicies else None + for idx in range(self.num_total_experts) + ]) + + self.gate = ReplicatedLinear(config.hidden_size, + self.num_total_experts, + bias=False, + quant_config=None) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + router_logits, _ = self.gate(hidden_states) + + if self.use_fused_moe: + ret = self.experts(hidden_states.half(), router_logits) + return ret.bfloat16() + else: + routing_weights = F.softmax(router_logits, + dim=1, + dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in self.expert_indicies: + expert_layer = self.experts[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum( + dim=-1, keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + num_tokens, hidden_dim) + class MixtralAttention(nn.Module): def __init__( @@ -216,13 +342,16 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.self_attn", ) - self.block_sparse_moe = MixtralMoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - quant_config=quant_config, - prefix=f"{prefix}.block_sparse_moe", + # self.block_sparse_moe = MixtralMoE( + # num_experts=config.num_local_experts, + # top_k=config.num_experts_per_tok, + # hidden_size=config.hidden_size, + # intermediate_size=config.intermediate_size, + # quant_config=quant_config, + # prefix=f"{prefix}.block_sparse_moe", + # ) + self.block_sparse_moe = QuantMixtralMoE( + config, use_fused_moe=True, quant_config=quant_config ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( @@ -465,80 +594,6 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - # def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - # stacked_params_mapping = [ - # # (param_name, shard_name, shard_id) - # ("qkv_proj", "q_proj", "q"), - # ("qkv_proj", "k_proj", "k"), - # ("qkv_proj", "v_proj", "v"), - # ] - - # # Params for weights, fp8 weight scales, fp8 activation scales - # # (param_name, weight_name, expert_id, shard_id) - # expert_params_mapping = 
FusedMoE.make_expert_params_mapping( - # ckpt_gate_proj_name="w1", - # ckpt_down_proj_name="w2", - # ckpt_up_proj_name="w3", - # num_experts=self.config.num_local_experts, - # ) - - # params_dict = dict(self.named_parameters()) - # for name, loaded_weight in weights: - # if "rotary_emb.inv_freq" in name: - # continue - - # for param_name, weight_name, shard_id in stacked_params_mapping: - # if weight_name not in name: - # continue - # name = name.replace(weight_name, param_name) - # # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - # # Skip layers on other devices. - # if is_pp_missing_parameter(name, self): - # continue - - # param = params_dict[name] - # weight_loader = param.weight_loader - # weight_loader(param, loaded_weight, shard_id, is_quantized=True) - # break - # else: - # for mapping in expert_params_mapping: - # param_name, weight_name, expert_id, shard_id = mapping - # if weight_name not in name: - # continue - # name = name.replace(weight_name, param_name) - # # Skip layers on other devices. - # if is_pp_missing_parameter(name, self): - # continue - # param = params_dict[name] - # weight_loader = param.weight_loader - # weight_loader( - # param, - # loaded_weight, - # weight_name, - # shard_id=shard_id, - # expert_id=expert_id, - # is_quantized=True, - # ) - # break - # else: - # # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - # # Skip layers on other devices. - # if is_pp_missing_parameter(name, self): - # continue - # # Remapping the name of FP8 kv-scale. - # name = maybe_remap_kv_scale_name(name, params_dict) - # if name is None: - # continue - - # param = params_dict[name] - # weight_loader = getattr( - # param, "weight_loader", default_weight_loader - # ) - # weight_loader(param, loaded_weight) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -547,73 +602,147 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + ) + params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: + + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader(param, loaded_weight, shard_id, is_quantized=True) break else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if self.use_fused_moe: - if ("block_sparse_moe.experts." in name - and ".w1." not in name and ".w2." not in name - and ".w3." 
not in name - and name not in params_dict): + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: continue - - if (".qzeros" in name): + name = name.replace(weight_name, param_name) + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + weight_name, + shard_id=shard_id, + expert_id=expert_id, + is_quantized=True, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: continue - shard_id = None - expert_id = 0 + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + # def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # stacked_params_mapping = [ + # # (param_name, shard_name, shard_id) + # ("qkv_proj", "q_proj", "q"), + # ("qkv_proj", "k_proj", "k"), + # ("qkv_proj", "v_proj", "v"), + # ] - has_any_numbered = (".qweight" in name or ".scales" in name - or ".g_idx" in name) - if (has_any_numbered and (".w1." in name)): - name = name.replace(".w1.", ".w13_") - shard_id = 0 - if (has_any_numbered and (".w2." in name)): - name = name.replace(".w2.", ".w2_") - shard_id = 0 - if (has_any_numbered and (".w3." in name)): - name = name.replace(".w3.", ".w13_") - shard_id = 1 + # params_dict = dict(self.named_parameters()) + # for name, loaded_weight in weights: + # if "rotary_emb.inv_freq" in name: + # continue + # for (param_name, weight_name, shard_id) in stacked_params_mapping: + # if weight_name not in name: + # continue + # name = name.replace(weight_name, param_name) + # # Skip loading extra bias for GPTQ models. + # if name.endswith(".bias") and name not in params_dict: + # continue + # param = params_dict[name] + # weight_loader = param.weight_loader + # weight_loader(param, loaded_weight, shard_id) + # break + # else: + # # Skip loading extra bias for GPTQ models. + # if name.endswith(".bias") and name not in params_dict: + # continue - exp_string = re.search(r"\.experts\.\d+.", name) - if exp_string: - exp_string = exp_string.group(0) - expert_id = int(exp_string.split(".")[2]) - name = name.replace(exp_string, ".experts.") + # if self.use_fused_moe: + # if ("block_sparse_moe.experts." in name + # and ".w1." not in name and ".w2." not in name + # and ".w3." not in name + # and name not in params_dict): + # continue - else: - if ("block_sparse_moe.experts." in name - and name not in params_dict): - continue + # if (".qzeros" in name): + # continue - param = params_dict[name] + # shard_id = None + # expert_id = 0 + + # has_any_numbered = (".qweight" in name or ".scales" in name + # or ".g_idx" in name) + # if (has_any_numbered and (".w1." in name)): + # name = name.replace(".w1.", ".w13_") + # shard_id = 0 + # if (has_any_numbered and (".w2." in name)): + # name = name.replace(".w2.", ".w2_") + # shard_id = 0 + # if (has_any_numbered and (".w3." 
in name)): + # name = name.replace(".w3.", ".w13_") + # shard_id = 1 + + # exp_string = re.search(r"\.experts\.\d+.", name) + # if exp_string: + # exp_string = exp_string.group(0) + # expert_id = int(exp_string.split(".")[2]) + # name = name.replace(exp_string, ".experts.") - if self.use_fused_moe and shard_id is not None: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight, name, shard_id, - expert_id, True) - else: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + # else: + # if ("block_sparse_moe.experts." in name + # and name not in params_dict): + # continue + + # param = params_dict[name] + + # if self.use_fused_moe and shard_id is not None: + # weight_loader = getattr(param, "weight_loader", + # default_weight_loader) + # weight_loader(param, loaded_weight, name, shard_id, + # expert_id, True) + # else: + # weight_loader = getattr(param, "weight_loader", + # default_weight_loader) + # weight_loader(param, loaded_weight) class QuantizedMixtralForCausalLM(nn.Module): From 8d327ded68e6eb992b083af003433cc739638ae3 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:31:26 -0700 Subject: [PATCH 076/106] Passing prefix --- vllm/model_executor/models/mixtral.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 8f577a0beb420..983a74be14513 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -158,6 +158,7 @@ def __init__( config: MixtralConfig, use_fused_moe: bool, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -188,7 +189,8 @@ def __init__( reduce_results=True, renormalize=True, quant_config=quant_config, - tp_size=self.tp_size) + tp_size=self.tp_size, + prefix=f"{prefix}.experts") else: self.experts = nn.ModuleList([ MixtralMLP(self.num_total_experts, @@ -202,7 +204,8 @@ def __init__( self.gate = ReplicatedLinear(config.hidden_size, self.num_total_experts, bias=False, - quant_config=None) + quant_config=None, + prefix=f"{prefix}.gate") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape @@ -351,7 +354,7 @@ def __init__( # prefix=f"{prefix}.block_sparse_moe", # ) self.block_sparse_moe = QuantMixtralMoE( - config, use_fused_moe=True, quant_config=quant_config + config, use_fused_moe=True, quant_config=quant_config, prefix=f"{prefix}.block_sparse_moe", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( From d337aeab98b5968d6c35cea6d004caa83d1d27cf Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:32:33 -0700 Subject: [PATCH 077/106] Weight load --- vllm/model_executor/models/mixtral.py | 365 ++++++++++++++------------ 1 file changed, 192 insertions(+), 173 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 983a74be14513..a656782307d47 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -31,7 +31,12 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank, tensor_model_parallel_all_reduce +from vllm.distributed import ( + 
get_pp_group, + get_tensor_model_parallel_world_size, + get_tensor_model_parallel_rank, + tensor_model_parallel_all_reduce, +) from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( @@ -114,8 +119,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = self.experts(hidden_states, router_logits) return final_hidden_states.view(orig_shape) -class MixtralMLP(nn.Module): +class MixtralMLP(nn.Module): def __init__( self, num_experts: int, @@ -128,18 +133,15 @@ def __init__( self.ffn_dim = intermediate_size self.hidden_dim = hidden_size - self.w1 = ReplicatedLinear(self.hidden_dim, - self.ffn_dim, - bias=False, - quant_config=quant_config) - self.w2 = ReplicatedLinear(self.ffn_dim, - self.hidden_dim, - bias=False, - quant_config=quant_config) - self.w3 = ReplicatedLinear(self.hidden_dim, - self.ffn_dim, - bias=False, - quant_config=quant_config) + self.w1 = ReplicatedLinear( + self.hidden_dim, self.ffn_dim, bias=False, quant_config=quant_config + ) + self.w2 = ReplicatedLinear( + self.ffn_dim, self.hidden_dim, bias=False, quant_config=quant_config + ) + self.w3 = ReplicatedLinear( + self.hidden_dim, self.ffn_dim, bias=False, quant_config=quant_config + ) # TODO: Use vllm's SiluAndMul self.act_fn = nn.SiLU() @@ -151,8 +153,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: current_hidden_states = w1_out * w3_out current_hidden_states, _ = self.w2(current_hidden_states) return current_hidden_states -class QuantMixtralMoE(nn.Module): + +class QuantMixtralMoE(nn.Module): def __init__( self, config: MixtralConfig, @@ -171,41 +174,51 @@ def __init__( if self.tp_size > self.num_total_experts: raise ValueError( f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {self.num_total_experts}.") + f"the number of experts {self.num_total_experts}." 
+ ) # Split experts equally between ranks - self.expert_indicies = np.array_split(range( - self.num_total_experts), self.tp_size)[self.rank].tolist() + self.expert_indicies = np.array_split( + range(self.num_total_experts), self.tp_size + )[self.rank].tolist() if not self.expert_indicies: - raise ValueError( - f"Rank {self.rank} has no experts assigned to it.") + raise ValueError(f"Rank {self.rank} has no experts assigned to it.") if self.use_fused_moe: params_dtype = torch.float16 - self.experts = FusedMoE(num_experts=self.num_total_experts, - top_k=self.top_k, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - params_dtype=params_dtype, - reduce_results=True, - renormalize=True, - quant_config=quant_config, - tp_size=self.tp_size, - prefix=f"{prefix}.experts") + self.experts = FusedMoE( + num_experts=self.num_total_experts, + top_k=self.top_k, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=self.tp_size, + prefix=f"{prefix}.experts", + ) else: - self.experts = nn.ModuleList([ - MixtralMLP(self.num_total_experts, - config.hidden_size, - config.intermediate_size, - quant_config=quant_config) - if idx in self.expert_indicies else None - for idx in range(self.num_total_experts) - ]) - - self.gate = ReplicatedLinear(config.hidden_size, - self.num_total_experts, - bias=False, - quant_config=None, - prefix=f"{prefix}.gate") + self.experts = nn.ModuleList( + [ + MixtralMLP( + self.num_total_experts, + config.hidden_size, + config.intermediate_size, + quant_config=quant_config, + ) + if idx in self.expert_indicies + else None + for idx in range(self.num_total_experts) + ] + ) + + self.gate = ReplicatedLinear( + config.hidden_size, + self.num_total_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate", + ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape @@ -216,30 +229,29 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ret = self.experts(hidden_states.half(), router_logits) return ret.bfloat16() else: - routing_weights = F.softmax(router_logits, - dim=1, - dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk( + routing_weights, self.top_k, dim=-1 + ) routing_weights /= routing_weights.sum(dim=-1, keepdim=True) final_hidden_states = None for expert_idx in self.expert_indicies: expert_layer = self.experts[expert_idx] - expert_mask = (selected_experts == expert_idx) + expert_mask = selected_experts == expert_idx expert_weights = (routing_weights * expert_mask).sum( - dim=-1, keepdim=True) + dim=-1, keepdim=True + ) - current_hidden_states = expert_layer(hidden_states).mul_( - expert_weights) + current_hidden_states = expert_layer(hidden_states).mul_(expert_weights) if final_hidden_states is None: final_hidden_states = current_hidden_states else: final_hidden_states.add_(current_hidden_states) return tensor_model_parallel_all_reduce(final_hidden_states).view( - num_tokens, hidden_dim) + num_tokens, hidden_dim + ) class MixtralAttention(nn.Module): @@ -354,7 +366,10 @@ def __init__( # prefix=f"{prefix}.block_sparse_moe", # ) self.block_sparse_moe = QuantMixtralMoE( - config, use_fused_moe=True, quant_config=quant_config, prefix=f"{prefix}.block_sparse_moe", + 
config, + use_fused_moe=True, + quant_config=quant_config, + prefix=f"{prefix}.block_sparse_moe", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( @@ -534,7 +549,7 @@ def __init__( self.config = config self.lora_config = lora_config - self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) + self.use_fused_moe = config.torch_dtype != torch.float8_e4m3fn self.model = MixtralModel( config, cache_config, quant_config, lora_config=lora_config, prefix="model" ) @@ -597,6 +612,80 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + # def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # stacked_params_mapping = [ + # # (param_name, shard_name, shard_id) + # ("qkv_proj", "q_proj", "q"), + # ("qkv_proj", "k_proj", "k"), + # ("qkv_proj", "v_proj", "v"), + # ] + + # # Params for weights, fp8 weight scales, fp8 activation scales + # # (param_name, weight_name, expert_id, shard_id) + # expert_params_mapping = FusedMoE.make_expert_params_mapping( + # ckpt_gate_proj_name="w1", + # ckpt_down_proj_name="w2", + # ckpt_up_proj_name="w3", + # num_experts=self.config.num_local_experts, + # ) + + # params_dict = dict(self.named_parameters()) + # for name, loaded_weight in weights: + # if "rotary_emb.inv_freq" in name: + # continue + + # for param_name, weight_name, shard_id in stacked_params_mapping: + # if weight_name not in name: + # continue + # name = name.replace(weight_name, param_name) + # # Skip loading extra bias for GPTQ models. + # if name.endswith(".bias") and name not in params_dict: + # continue + # # Skip layers on other devices. + # if is_pp_missing_parameter(name, self): + # continue + + # param = params_dict[name] + # weight_loader = param.weight_loader + # weight_loader(param, loaded_weight, shard_id, is_quantized=True) + # break + # else: + # for mapping in expert_params_mapping: + # param_name, weight_name, expert_id, shard_id = mapping + # if weight_name not in name: + # continue + # name = name.replace(weight_name, param_name) + # # Skip layers on other devices. + # if is_pp_missing_parameter(name, self): + # continue + # param = params_dict[name] + # weight_loader = param.weight_loader + # weight_loader( + # param, + # loaded_weight, + # weight_name, + # shard_id=shard_id, + # expert_id=expert_id, + # is_quantized=True, + # ) + # break + # else: + # # Skip loading extra bias for GPTQ models. + # if name.endswith(".bias") and name not in params_dict: + # continue + # # Skip layers on other devices. + # if is_pp_missing_parameter(name, self): + # continue + # # Remapping the name of FP8 kv-scale. 
+ # name = maybe_remap_kv_scale_name(name, params_dict) + # if name is None: + # continue + + # param = params_dict[name] + # weight_loader = getattr( + # param, "weight_loader", default_weight_loader + # ) + # weight_loader(param, loaded_weight) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -605,20 +694,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts, - ) - params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -626,126 +705,66 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id, is_quantized=True) + weight_loader(param, loaded_weight, shard_id) break else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if self.use_fused_moe: + if ( + "block_sparse_moe.experts." in name + and ".w1." not in name + and ".w2." not in name + and ".w3." not in name + and name not in params_dict + ): continue - name = name.replace(weight_name, param_name) - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + + if ".qzeros" in name: continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( - param, - loaded_weight, - weight_name, - shard_id=shard_id, - expert_id=expert_id, - is_quantized=True, + + shard_id = None + expert_id = 0 + + has_any_numbered = ( + ".qweight" in name or ".scales" in name or ".g_idx" in name ) - break + if has_any_numbered and (".w1." in name): + name = name.replace(".w1.", ".w13_") + shard_id = 0 + if has_any_numbered and (".w2." in name): + name = name.replace(".w2.", ".w2_") + shard_id = 0 + if has_any_numbered and (".w3." in name): + name = name.replace(".w3.", ".w13_") + shard_id = 1 + + exp_string = re.search(r"\.experts\.\d+.", name) + if exp_string: + exp_string = exp_string.group(0) + expert_id = int(exp_string.split(".")[2]) + name = name.replace(exp_string, ".experts.") + else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: + if "block_sparse_moe.experts." 
in name and name not in params_dict: continue - param = params_dict[name] + param = params_dict[name] + + if self.use_fused_moe and shard_id is not None: + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight, name, shard_id, expert_id, True) + else: weight_loader = getattr( param, "weight_loader", default_weight_loader ) weight_loader(param, loaded_weight) - # def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - # stacked_params_mapping = [ - # # (param_name, shard_name, shard_id) - # ("qkv_proj", "q_proj", "q"), - # ("qkv_proj", "k_proj", "k"), - # ("qkv_proj", "v_proj", "v"), - # ] - - # params_dict = dict(self.named_parameters()) - # for name, loaded_weight in weights: - # if "rotary_emb.inv_freq" in name: - # continue - # for (param_name, weight_name, shard_id) in stacked_params_mapping: - # if weight_name not in name: - # continue - # name = name.replace(weight_name, param_name) - # # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - # param = params_dict[name] - # weight_loader = param.weight_loader - # weight_loader(param, loaded_weight, shard_id) - # break - # else: - # # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - - # if self.use_fused_moe: - # if ("block_sparse_moe.experts." in name - # and ".w1." not in name and ".w2." not in name - # and ".w3." not in name - # and name not in params_dict): - # continue - - # if (".qzeros" in name): - # continue - - # shard_id = None - # expert_id = 0 - - # has_any_numbered = (".qweight" in name or ".scales" in name - # or ".g_idx" in name) - # if (has_any_numbered and (".w1." in name)): - # name = name.replace(".w1.", ".w13_") - # shard_id = 0 - # if (has_any_numbered and (".w2." in name)): - # name = name.replace(".w2.", ".w2_") - # shard_id = 0 - # if (has_any_numbered and (".w3." in name)): - # name = name.replace(".w3.", ".w13_") - # shard_id = 1 - - # exp_string = re.search(r"\.experts\.\d+.", name) - # if exp_string: - # exp_string = exp_string.group(0) - # expert_id = int(exp_string.split(".")[2]) - # name = name.replace(exp_string, ".experts.") - - # else: - # if ("block_sparse_moe.experts." 
in name - # and name not in params_dict): - # continue - - # param = params_dict[name] - - # if self.use_fused_moe and shard_id is not None: - # weight_loader = getattr(param, "weight_loader", - # default_weight_loader) - # weight_loader(param, loaded_weight, name, shard_id, - # expert_id, True) - # else: - # weight_loader = getattr(param, "weight_loader", - # default_weight_loader) - # weight_loader(param, loaded_weight) class QuantizedMixtralForCausalLM(nn.Module): From 379f3e82cb4f1c5a60f2ae6a229a07e41a6c4aeb Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:40:08 -0700 Subject: [PATCH 078/106] Weight load back --- vllm/model_executor/models/mixtral.py | 414 +++++++------------------- 1 file changed, 112 insertions(+), 302 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a656782307d47..64572444ee0f7 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -469,51 +469,6 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - -class QuantMixtralModel(nn.Module): - def __init__( - self, - config: MixtralConfig, - use_fused_moe: bool, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList( - [ - MixtralDecoderLayer( - config, use_fused_moe, cache_config, quant_config=quant_config - ) - for _ in range(config.num_hidden_layers) - ] - ) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, hidden_states, kv_caches[i], attn_metadata, residual - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - class MixtralForCausalLM(nn.Module, SupportsLoRA): fall_back_to_pt_during_load = False @@ -612,80 +567,6 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - # def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - # stacked_params_mapping = [ - # # (param_name, shard_name, shard_id) - # ("qkv_proj", "q_proj", "q"), - # ("qkv_proj", "k_proj", "k"), - # ("qkv_proj", "v_proj", "v"), - # ] - - # # Params for weights, fp8 weight scales, fp8 activation scales - # # (param_name, weight_name, expert_id, shard_id) - # expert_params_mapping = FusedMoE.make_expert_params_mapping( - # ckpt_gate_proj_name="w1", - # ckpt_down_proj_name="w2", - # ckpt_up_proj_name="w3", - # num_experts=self.config.num_local_experts, - # ) - - # params_dict = dict(self.named_parameters()) - # for name, loaded_weight in weights: - # if "rotary_emb.inv_freq" in name: - # continue - - # for param_name, weight_name, shard_id in stacked_params_mapping: - # if weight_name not in name: - # continue - # name = name.replace(weight_name, param_name) - # # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - # # Skip layers on other devices. 
- # if is_pp_missing_parameter(name, self): - # continue - - # param = params_dict[name] - # weight_loader = param.weight_loader - # weight_loader(param, loaded_weight, shard_id, is_quantized=True) - # break - # else: - # for mapping in expert_params_mapping: - # param_name, weight_name, expert_id, shard_id = mapping - # if weight_name not in name: - # continue - # name = name.replace(weight_name, param_name) - # # Skip layers on other devices. - # if is_pp_missing_parameter(name, self): - # continue - # param = params_dict[name] - # weight_loader = param.weight_loader - # weight_loader( - # param, - # loaded_weight, - # weight_name, - # shard_id=shard_id, - # expert_id=expert_id, - # is_quantized=True, - # ) - # break - # else: - # # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - # # Skip layers on other devices. - # if is_pp_missing_parameter(name, self): - # continue - # # Remapping the name of FP8 kv-scale. - # name = maybe_remap_kv_scale_name(name, params_dict) - # if name is None: - # continue - - # param = params_dict[name] - # weight_loader = getattr( - # param, "weight_loader", default_weight_loader - # ) - # weight_loader(param, loaded_weight) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -694,10 +575,20 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + ) + params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -705,208 +596,127 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader(param, loaded_weight, shard_id, is_quantized=True) break else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if self.use_fused_moe: - if ( - "block_sparse_moe.experts." in name - and ".w1." not in name - and ".w2." not in name - and ".w3." not in name - and name not in params_dict - ): + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: continue - - if ".qzeros" in name: + name = name.replace(weight_name, param_name) + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): continue - - shard_id = None - expert_id = 0 - - has_any_numbered = ( - ".qweight" in name or ".scales" in name or ".g_idx" in name + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + weight_name, + shard_id=shard_id, + expert_id=expert_id, + is_quantized=True, ) - if has_any_numbered and (".w1." in name): - name = name.replace(".w1.", ".w13_") - shard_id = 0 - if has_any_numbered and (".w2." 
in name): - name = name.replace(".w2.", ".w2_") - shard_id = 0 - if has_any_numbered and (".w3." in name): - name = name.replace(".w3.", ".w13_") - shard_id = 1 - - exp_string = re.search(r"\.experts\.\d+.", name) - if exp_string: - exp_string = exp_string.group(0) - expert_id = int(exp_string.split(".")[2]) - name = name.replace(exp_string, ".experts.") - + break else: - if "block_sparse_moe.experts." in name and name not in params_dict: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: continue - param = params_dict[name] - - if self.use_fused_moe and shard_id is not None: - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight, name, shard_id, expert_id, True) - else: + param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader ) weight_loader(param, loaded_weight) + # def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # stacked_params_mapping = [ + # # (param_name, shard_name, shard_id) + # ("qkv_proj", "q_proj", "q"), + # ("qkv_proj", "k_proj", "k"), + # ("qkv_proj", "v_proj", "v"), + # ] + # params_dict = dict(self.named_parameters()) + # for name, loaded_weight in weights: + # if "rotary_emb.inv_freq" in name: + # continue + # for param_name, weight_name, shard_id in stacked_params_mapping: + # if weight_name not in name: + # continue + # name = name.replace(weight_name, param_name) + # # Skip loading extra bias for GPTQ models. + # if name.endswith(".bias") and name not in params_dict: + # continue + # param = params_dict[name] + # weight_loader = param.weight_loader + # weight_loader(param, loaded_weight, shard_id) + # break + # else: + # # Skip loading extra bias for GPTQ models. 
+ # if name.endswith(".bias") and name not in params_dict: + # continue -class QuantizedMixtralForCausalLM(nn.Module): - fall_back_to_pt_during_load = False - - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - - # TODO check runs with dtype=float16 - self.use_fused_moe = config.torch_dtype != torch.float8_e4m3fn - - self.config = config - self.quant_config = quant_config - self.model = QuantMixtralModel( - config, self.use_fused_moe, cache_config, quant_config - ) - self.lm_head = ParallelLMHead( - config.vocab_size, config.hidden_size, quant_config=quant_config - ) - self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata) - return hidden_states - - def compute_logits( - self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata - ) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) - return logits - - # def make_empty_intermediate_tensors( - # self, batch_size: int, dtype: torch.dtype, device: torch.device - # ) -> IntermediateTensors: - # return IntermediateTensors( - # { - # "hidden_states": torch.zeros( - # (batch_size, self.config.hidden_size), dtype=dtype, device=device - # ), - # "residual": torch.zeros( - # (batch_size, self.config.hidden_size), dtype=dtype, device=device - # ), - # } - # ) - - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - params_dict = dict(self.named_parameters()) - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if self.use_fused_moe: - if ( - "block_sparse_moe.experts." in name - and ".w1." not in name - and ".w2." not in name - and ".w3." not in name - and name not in params_dict - ): - continue + # if self.use_fused_moe: + # if ( + # "block_sparse_moe.experts." in name + # and ".w1." not in name + # and ".w2." not in name + # and ".w3." not in name + # and name not in params_dict + # ): + # continue - if ".qzeros" in name: - continue + # if ".qzeros" in name: + # continue - shard_id = None - expert_id = 0 + # shard_id = None + # expert_id = 0 - has_any_numbered = ( - ".qweight" in name or ".scales" in name or ".g_idx" in name - ) - if has_any_numbered and (".w1." 
in name): - name = name.replace(".w1.", ".w13_") - shard_id = 0 - if has_any_numbered and (".w2." in name): - name = name.replace(".w2.", ".w2_") - shard_id = 0 - if has_any_numbered and (".w3." in name): - name = name.replace(".w3.", ".w13_") - shard_id = 1 - - exp_string = re.search(r"\.experts\.\d+.", name) - if exp_string: - exp_string = exp_string.group(0) - expert_id = int(exp_string.split(".")[2]) - name = name.replace(exp_string, ".experts.") + # has_any_numbered = ( + # ".qweight" in name or ".scales" in name or ".g_idx" in name + # ) + # if has_any_numbered and (".w1." in name): + # name = name.replace(".w1.", ".w13_") + # shard_id = 0 + # if has_any_numbered and (".w2." in name): + # name = name.replace(".w2.", ".w2_") + # shard_id = 0 + # if has_any_numbered and (".w3." in name): + # name = name.replace(".w3.", ".w13_") + # shard_id = 1 + + # exp_string = re.search(r"\.experts\.\d+.", name) + # if exp_string: + # exp_string = exp_string.group(0) + # expert_id = int(exp_string.split(".")[2]) + # name = name.replace(exp_string, ".experts.") - else: - if "block_sparse_moe.experts." in name and name not in params_dict: - continue + # else: + # if "block_sparse_moe.experts." in name and name not in params_dict: + # continue - param = params_dict[name] + # param = params_dict[name] - if self.use_fused_moe and shard_id is not None: - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight, name, shard_id, expert_id, True) - else: - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight) + # if self.use_fused_moe and shard_id is not None: + # weight_loader = getattr( + # param, "weight_loader", default_weight_loader + # ) + # weight_loader(param, loaded_weight, name, shard_id, expert_id, True) + # else: + # weight_loader = getattr( + # param, "weight_loader", default_weight_loader + # ) + # weight_loader(param, loaded_weight) \ No newline at end of file From a5d356ec8c09a5033f3222ffdeaf682c791fadcd Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:44:11 -0700 Subject: [PATCH 079/106] Load with name not weight name --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 64572444ee0f7..0a700a4c57f89 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -618,7 +618,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader( param, loaded_weight, - weight_name, + name, shard_id=shard_id, expert_id=expert_id, is_quantized=True, From 62c0135d186dba941c2e35ccb5eb4c49891ef3e5 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:46:13 -0700 Subject: [PATCH 080/106] params dict should load from old name --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0a700a4c57f89..84b150a245ef0 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -592,6 +592,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue + param = params_dict[name] name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: @@ -600,7 +601,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if is_pp_missing_parameter(name, self): continue - param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id, is_quantized=True) break @@ -609,11 +609,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue - name = name.replace(weight_name, param_name) # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue param = params_dict[name] + name = name.replace(weight_name, param_name) weight_loader = param.weight_loader weight_loader( param, From d23c00c63796eaaa0b96dd703e993d7113a22bbb Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:47:33 -0700 Subject: [PATCH 081/106] logging name and parmas --- vllm/model_executor/models/mixtral.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 84b150a245ef0..b1fd87dd080c0 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -62,7 +62,8 @@ from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers - +import logging +logger = logging.getLogger(__name__) class MixtralMoE(nn.Module): """A tensor-parallel MoE implementation for Mixtral that shards each expert @@ -585,6 +586,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ) params_dict = dict(self.named_parameters()) + logger.error(params_dict) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -612,6 +614,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + logger.error(name) param = params_dict[name] name = name.replace(weight_name, param_name) weight_loader = param.weight_loader From 6dda4475a2cd8936353eb33086989651dd424674 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:48:59 -0700 Subject: [PATCH 082/106] log expert parmas map --- vllm/model_executor/models/mixtral.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index b1fd87dd080c0..2265d1ac590bd 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -586,7 +586,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ) params_dict = dict(self.named_parameters()) - logger.error(params_dict) + logger.error(params_dict.keys()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -594,8 +594,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - param = params_dict[name] name = name.replace(weight_name, param_name) + param = params_dict[name] # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue @@ -607,6 +607,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, shard_id, is_quantized=True) break else: + logger.error(expert_params_mapping) for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: @@ -614,9 +615,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + name = name.replace(weight_name, param_name) logger.error(name) param = params_dict[name] - name = name.replace(weight_name, param_name) weight_loader = param.weight_loader weight_loader( param, From 67ce7b65309a7cd9791957db114e7eb0403190a5 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 13:58:38 -0700 Subject: [PATCH 083/106] parity with prev commits --- vllm/model_executor/models/mixtral.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 2265d1ac590bd..a11f2d8f31f8c 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -595,7 +595,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if weight_name not in name: continue name = name.replace(weight_name, param_name) - param = params_dict[name] # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue @@ -603,6 +602,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if is_pp_missing_parameter(name, self): continue + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id, is_quantized=True) break @@ -613,10 +613,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if weight_name not in name: continue # Skip layers on other devices. 
+                    name = name.replace(weight_name, param_name)
                     if is_pp_missing_parameter(name, self):
                         continue
-                    name = name.replace(weight_name, param_name)
-                    logger.error(name)
                     param = params_dict[name]
                     weight_loader = param.weight_loader
                     weight_loader(

From bd933c975d8a327a79f0242996551ae55155b1d5 Mon Sep 17 00:00:00 2001
From: DhruvaBansal00
Date: Thu, 15 Aug 2024 14:02:09 -0700
Subject: [PATCH 084/106] Adding qzeros to mapping

---
 vllm/model_executor/layers/fused_moe/layer.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 0b06ee86a308d..34bad93f052dc 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -399,4 +399,15 @@ def make_expert_params_mapping(
                 shard_id,
             ) for expert_id in range(num_experts)
             for shard_id, weight_name in enumerate(gate_down_up)
+        ] + [
+            # These are the qzeros for the experts
+            # (param_name, weight_name, expert_id, shard_id)
+            (
+                "experts.w13_qzeros"
+                if weight_name in gate_up else "experts.w2_qzeros",
+                f"experts.{expert_id}.{weight_name}.qzeros",
+                expert_id,
+                shard_id,
+            ) for expert_id in range(num_experts)
+            for shard_id, weight_name in enumerate(gate_down_up)
         ])

From 77cd09561efa4798c26788dfff1352cfa08b39fa Mon Sep 17 00:00:00 2001
From: DhruvaBansal00
Date: Thu, 15 Aug 2024 14:05:14 -0700
Subject: [PATCH 085/106] Remove log

---
 vllm/model_executor/models/mixtral.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index a11f2d8f31f8c..fc170cee534cf 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -586,7 +586,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         )
 
         params_dict = dict(self.named_parameters())
-        logger.error(params_dict.keys())
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
@@ -607,7 +606,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader(param, loaded_weight, shard_id, is_quantized=True)
                 break
             else:
-                logger.error(expert_params_mapping)
                 for mapping in expert_params_mapping:
                     param_name, weight_name, expert_id, shard_id = mapping
                     if weight_name not in name:

From 529191eb7a1b8fe55a9d56bee373b348f825c4df Mon Sep 17 00:00:00 2001
From: DhruvaBansal00
Date: Thu, 15 Aug 2024 14:05:47 -0700
Subject: [PATCH 086/106] Remove is quantized

---
 vllm/model_executor/models/mixtral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index fc170cee534cf..fce9305008abd 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -603,7 +603,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
 
                 param = params_dict[name]
                 weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id, is_quantized=True)
+                weight_loader(param, loaded_weight, shard_id)
                 break
             else:
                 for mapping in expert_params_mapping:

From 2450543233960ca5bfaaa3f21d38a863d2ece851 Mon Sep 17 00:00:00 2001
From: DhruvaBansal00
Date: Thu, 15 Aug 2024 14:21:32 -0700
Subject: [PATCH 087/106] Assume fused true

---
 vllm/model_executor/models/mixtral.py | 194 +++-----------------------
 1 file changed, 18 insertions(+), 176 deletions(-)

diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index
fce9305008abd..a6016b916c883 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -120,53 +120,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = self.experts(hidden_states, router_logits) return final_hidden_states.view(orig_shape) - -class MixtralMLP(nn.Module): - def __init__( - self, - num_experts: int, - hidden_size: int, - intermediate_size: int, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - self.num_experts = num_experts - self.ffn_dim = intermediate_size - self.hidden_dim = hidden_size - - self.w1 = ReplicatedLinear( - self.hidden_dim, self.ffn_dim, bias=False, quant_config=quant_config - ) - self.w2 = ReplicatedLinear( - self.ffn_dim, self.hidden_dim, bias=False, quant_config=quant_config - ) - self.w3 = ReplicatedLinear( - self.hidden_dim, self.ffn_dim, bias=False, quant_config=quant_config - ) - - # TODO: Use vllm's SiluAndMul - self.act_fn = nn.SiLU() - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - w1_out, _ = self.w1(hidden_states) - w1_out = self.act_fn(w1_out) - w3_out, _ = self.w3(hidden_states) - current_hidden_states = w1_out * w3_out - current_hidden_states, _ = self.w2(current_hidden_states) - return current_hidden_states - - class QuantMixtralMoE(nn.Module): def __init__( self, config: MixtralConfig, - use_fused_moe: bool, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): super().__init__() self.config = config - self.use_fused_moe = use_fused_moe self.quant_config = quant_config self.rank = get_tensor_model_parallel_rank() self.tp_size = get_tensor_model_parallel_world_size() @@ -184,35 +146,7 @@ def __init__( if not self.expert_indicies: raise ValueError(f"Rank {self.rank} has no experts assigned to it.") - if self.use_fused_moe: - params_dtype = torch.float16 - self.experts = FusedMoE( - num_experts=self.num_total_experts, - top_k=self.top_k, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - params_dtype=params_dtype, - reduce_results=True, - renormalize=True, - quant_config=quant_config, - tp_size=self.tp_size, - prefix=f"{prefix}.experts", - ) - else: - self.experts = nn.ModuleList( - [ - MixtralMLP( - self.num_total_experts, - config.hidden_size, - config.intermediate_size, - quant_config=quant_config, - ) - if idx in self.expert_indicies - else None - for idx in range(self.num_total_experts) - ] - ) - + params_dtype = torch.float16 self.gate = ReplicatedLinear( config.hidden_size, self.num_total_experts, @@ -220,40 +154,27 @@ def __init__( quant_config=None, prefix=f"{prefix}.gate", ) + self.experts = FusedMoE( + num_experts=self.num_total_experts, + top_k=self.top_k, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=self.tp_size, + prefix=f"{prefix}.experts", + ) + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape + _, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) router_logits, _ = self.gate(hidden_states) - if self.use_fused_moe: - ret = self.experts(hidden_states.half(), router_logits) - return ret.bfloat16() - else: - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk( - routing_weights, self.top_k, dim=-1 - ) - routing_weights /= 
routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = None - for expert_idx in self.expert_indicies: - expert_layer = self.experts[expert_idx] - expert_mask = selected_experts == expert_idx - expert_weights = (routing_weights * expert_mask).sum( - dim=-1, keepdim=True - ) - - current_hidden_states = expert_layer(hidden_states).mul_(expert_weights) - if final_hidden_states is None: - final_hidden_states = current_hidden_states - else: - final_hidden_states.add_(current_hidden_states) - - return tensor_model_parallel_all_reduce(final_hidden_states).view( - num_tokens, hidden_dim - ) - + ret = self.experts(hidden_states.half(), router_logits) + return ret.bfloat16() class MixtralAttention(nn.Module): def __init__( @@ -641,83 +562,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr( param, "weight_loader", default_weight_loader ) - weight_loader(param, loaded_weight) - # def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - # stacked_params_mapping = [ - # # (param_name, shard_name, shard_id) - # ("qkv_proj", "q_proj", "q"), - # ("qkv_proj", "k_proj", "k"), - # ("qkv_proj", "v_proj", "v"), - # ] - - # params_dict = dict(self.named_parameters()) - # for name, loaded_weight in weights: - # if "rotary_emb.inv_freq" in name: - # continue - # for param_name, weight_name, shard_id in stacked_params_mapping: - # if weight_name not in name: - # continue - # name = name.replace(weight_name, param_name) - # # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - # param = params_dict[name] - # weight_loader = param.weight_loader - # weight_loader(param, loaded_weight, shard_id) - # break - # else: - # # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - - # if self.use_fused_moe: - # if ( - # "block_sparse_moe.experts." in name - # and ".w1." not in name - # and ".w2." not in name - # and ".w3." not in name - # and name not in params_dict - # ): - # continue - - # if ".qzeros" in name: - # continue - - # shard_id = None - # expert_id = 0 - - # has_any_numbered = ( - # ".qweight" in name or ".scales" in name or ".g_idx" in name - # ) - # if has_any_numbered and (".w1." in name): - # name = name.replace(".w1.", ".w13_") - # shard_id = 0 - # if has_any_numbered and (".w2." in name): - # name = name.replace(".w2.", ".w2_") - # shard_id = 0 - # if has_any_numbered and (".w3." in name): - # name = name.replace(".w3.", ".w13_") - # shard_id = 1 - - # exp_string = re.search(r"\.experts\.\d+.", name) - # if exp_string: - # exp_string = exp_string.group(0) - # expert_id = int(exp_string.split(".")[2]) - # name = name.replace(exp_string, ".experts.") - - # else: - # if "block_sparse_moe.experts." 
in name and name not in params_dict: - # continue - - # param = params_dict[name] - - # if self.use_fused_moe and shard_id is not None: - # weight_loader = getattr( - # param, "weight_loader", default_weight_loader - # ) - # weight_loader(param, loaded_weight, name, shard_id, expert_id, True) - # else: - # weight_loader = getattr( - # param, "weight_loader", default_weight_loader - # ) - # weight_loader(param, loaded_weight) \ No newline at end of file + weight_loader(param, loaded_weight) \ No newline at end of file From 8cba45e2de78fd78b9aef7676ed70deef9b4d5c4 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 14:22:10 -0700 Subject: [PATCH 088/106] rm fused true --- vllm/model_executor/models/mixtral.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a6016b916c883..5861cf6df9bc8 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -289,7 +289,6 @@ def __init__( # ) self.block_sparse_moe = QuantMixtralMoE( config, - use_fused_moe=True, quant_config=quant_config, prefix=f"{prefix}.block_sparse_moe", ) @@ -426,7 +425,6 @@ def __init__( self.config = config self.lora_config = lora_config - self.use_fused_moe = config.torch_dtype != torch.float8_e4m3fn self.model = MixtralModel( config, cache_config, quant_config, lora_config=lora_config, prefix="model" ) From 10940a5c503d850f8a15f08359a0e88593cabdbd Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 14:40:31 -0700 Subject: [PATCH 089/106] Switching to mixtral moe --- vllm/model_executor/models/mixtral.py | 32 +++++++++------------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 5861cf6df9bc8..adc94a226c9d1 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -134,17 +134,6 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_total_experts = config.num_local_experts self.top_k = config.num_experts_per_tok - if self.tp_size > self.num_total_experts: - raise ValueError( - f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {self.num_total_experts}." 
- ) - # Split experts equally between ranks - self.expert_indicies = np.array_split( - range(self.num_total_experts), self.tp_size - )[self.rank].tolist() - if not self.expert_indicies: - raise ValueError(f"Rank {self.rank} has no experts assigned to it.") params_dtype = torch.float16 self.gate = ReplicatedLinear( @@ -279,19 +268,20 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.self_attn", ) - # self.block_sparse_moe = MixtralMoE( - # num_experts=config.num_local_experts, - # top_k=config.num_experts_per_tok, - # hidden_size=config.hidden_size, - # intermediate_size=config.intermediate_size, - # quant_config=quant_config, - # prefix=f"{prefix}.block_sparse_moe", - # ) - self.block_sparse_moe = QuantMixtralMoE( - config, + self.block_sparse_moe = MixtralMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, quant_config=quant_config, + tp_size=get_tensor_model_parallel_world_size(), prefix=f"{prefix}.block_sparse_moe", ) + # self.block_sparse_moe = QuantMixtralMoE( + # config, + # quant_config=quant_config, + # prefix=f"{prefix}.block_sparse_moe", + # ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( config.hidden_size, eps=config.rms_norm_eps From 895ffbe2704794574c38a7dfc6352e810b0d538e Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 14:49:17 -0700 Subject: [PATCH 090/106] Precision changes --- vllm/model_executor/models/mixtral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index adc94a226c9d1..7f2109e337e27 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -93,7 +93,6 @@ def __init__( hidden_size, num_experts, bias=False, - params_dtype=params_dtype, quant_config=None, prefix=f"{prefix}.gate", ) @@ -117,8 +116,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts(hidden_states, router_logits) - return final_hidden_states.view(orig_shape) + final_hidden_states = self.experts(hidden_states.half(), router_logits) + return final_hidden_states.view(orig_shape).bfloat16() class QuantMixtralMoE(nn.Module): def __init__( @@ -275,6 +274,7 @@ def __init__( intermediate_size=config.intermediate_size, quant_config=quant_config, tp_size=get_tensor_model_parallel_world_size(), + params_dtype=torch.float16, prefix=f"{prefix}.block_sparse_moe", ) # self.block_sparse_moe = QuantMixtralMoE( From e54b2e4d47e9d748013fc4aede3ac93efd8278ec Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 14:58:19 -0700 Subject: [PATCH 091/106] Cleanup --- vllm/model_executor/models/mixtral.py | 51 --------------------------- 1 file changed, 51 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 7f2109e337e27..9f01f9349335e 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -118,52 +118,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_logits, _ = self.gate(hidden_states) final_hidden_states = self.experts(hidden_states.half(), router_logits) return final_hidden_states.view(orig_shape).bfloat16() - -class 
QuantMixtralMoE(nn.Module): - def __init__( - self, - config: MixtralConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.config = config - self.quant_config = quant_config - self.rank = get_tensor_model_parallel_rank() - self.tp_size = get_tensor_model_parallel_world_size() - self.num_total_experts = config.num_local_experts - self.top_k = config.num_experts_per_tok - - params_dtype = torch.float16 - self.gate = ReplicatedLinear( - config.hidden_size, - self.num_total_experts, - bias=False, - quant_config=None, - prefix=f"{prefix}.gate", - ) - self.experts = FusedMoE( - num_experts=self.num_total_experts, - top_k=self.top_k, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - params_dtype=params_dtype, - reduce_results=True, - renormalize=True, - quant_config=quant_config, - tp_size=self.tp_size, - prefix=f"{prefix}.experts", - ) - - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - _, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - router_logits, _ = self.gate(hidden_states) - - ret = self.experts(hidden_states.half(), router_logits) - return ret.bfloat16() - class MixtralAttention(nn.Module): def __init__( self, @@ -277,11 +231,6 @@ def __init__( params_dtype=torch.float16, prefix=f"{prefix}.block_sparse_moe", ) - # self.block_sparse_moe = QuantMixtralMoE( - # config, - # quant_config=quant_config, - # prefix=f"{prefix}.block_sparse_moe", - # ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( config.hidden_size, eps=config.rms_norm_eps From b4f23dc6b0a4677fc3bf137576afe93e25b2184b Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 15:03:19 -0700 Subject: [PATCH 092/106] Mixtral quant parity: --- vllm/model_executor/models/mixtral_quant.py | 241 -------------------- 1 file changed, 241 deletions(-) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index c0837802ce318..c9143552224f5 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -361,75 +361,6 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - - -class LoRAMixtralModel(nn.Module): - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.padding_idx = config.pad_token_id - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: MixtralDecoderLayer( - config, use_fused_moe=True, cache_config=cache_config, quant_config=quant_config - ), - prefix=f"{prefix}.layers", - ) - - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], - ) -> torch.Tensor: - if 
get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, - residual, - ) - if not get_pp_group().is_last_rank: - return IntermediateTensors( - {"hidden_states": hidden_states, "residual": residual} - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - class MixtralForCausalLM(nn.Module): fall_back_to_pt_during_load = False @@ -488,178 +419,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if self.use_fused_moe: - if ("block_sparse_moe.experts." in name - and ".w1." not in name and ".w2." not in name - and ".w3." not in name - and name not in params_dict): - continue - - if (".qzeros" in name): - continue - - shard_id = None - expert_id = 0 - - has_any_numbered = (".qweight" in name or ".scales" in name - or ".g_idx" in name) - if (has_any_numbered and (".w1." in name)): - name = name.replace(".w1.", ".w13_") - shard_id = 0 - if (has_any_numbered and (".w2." in name)): - name = name.replace(".w2.", ".w2_") - shard_id = 0 - if (has_any_numbered and (".w3." in name)): - name = name.replace(".w3.", ".w13_") - shard_id = 1 - - exp_string = re.search(r"\.experts\.\d+.", name) - if exp_string: - exp_string = exp_string.group(0) - expert_id = int(exp_string.split(".")[2]) - name = name.replace(exp_string, ".experts.") - - else: - if ("block_sparse_moe.experts." 
in name - and name not in params_dict): - continue - - param = params_dict[name] - - if self.use_fused_moe and shard_id is not None: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight, name, shard_id, - expert_id, True) - else: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) -class LoRAEnabledMixtralForCausalLM(nn.Module, SupportsLoRA): - fall_back_to_pt_during_load = False - - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - } - - # LoRA specific attributes - supported_lora_modules = [ - "qkv_proj", - "o_proj", - "embed_tokens", - "lm_head", - ] - embedding_modules = { - "embed_tokens": "input_embeddings", - "lm_head": "output_embeddings", - } - embedding_padding_modules = ["lm_head"] - - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - - self.config = config - self.lora_config = lora_config - self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) - self.model = LoRAMixtralModel( - config=config, cache_config=cache_config, quant_config=quant_config, lora_config=lora_config, prefix="model" - ) - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, - quant_config=quant_config, - ) - self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size - ) - self.sampler = Sampler() - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> torch.Tensor: - hidden_states = self.model( - input_ids, positions, kv_caches, attn_metadata, intermediate_tensors - ) - return hidden_states - - def compute_logits( - self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata - ) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) - return logits - - def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, device: torch.device - ) -> IntermediateTensors: - return IntermediateTensors( - { - "hidden_states": torch.zeros( - (batch_size, self.config.hidden_size), dtype=dtype, device=device - ), - "residual": torch.zeros( - (batch_size, self.config.hidden_size), dtype=dtype, device=device - ), - } - ) - - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: From d59fe3b166ad597517f451fb4d30d5223851517f Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: 
Thu, 15 Aug 2024 15:08:08 -0700 Subject: [PATCH 093/106] fixing tests --- tests/kernels/test_moe.py | 1 + vllm/model_executor/layers/fused_moe/fused_moe_marlin.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index e657581df05a0..b53f578988214 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -239,6 +239,7 @@ def test_fused_marlin_moe( renormalize=False, w1_scale=scales1, w2_scale=scales2, + num_bits=num_bits, ) assert compute_max_diff(marlin_output, triton_output) < 4e-2 diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index efafcef2f1ee7..3e080a1393e16 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -120,7 +120,7 @@ def fused_moe_marlin( False, ) - ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, (num_bits // 2) * N)) + ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( intermediate_cache2, From 0d9cbdc95a12524b750ab3f39aa4fffa7f692d29 Mon Sep 17 00:00:00 2001 From: Dhruva Bansal Date: Thu, 15 Aug 2024 23:24:02 +0000 Subject: [PATCH 094/106] Tests working and correctness verified --- vllm/model_executor/models/mixtral.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9f01f9349335e..9d99606b49ec3 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -80,20 +80,21 @@ def __init__( top_k: int, hidden_size: int, intermediate_size: int, - params_dtype: Optional[torch.dtype] = None, + params_dtype: Optional[torch.dtype] = torch.float16, quant_config: Optional[QuantizationConfig] = None, tp_size: Optional[int] = None, prefix: str = "", ): super().__init__() self.hidden_size = hidden_size - + self.params_dtype = params_dtype # Gate always runs at half / full precision for now. self.gate = ReplicatedLinear( hidden_size, num_experts, bias=False, quant_config=None, + params_dtype=params_dtype, prefix=f"{prefix}.gate", ) @@ -112,12 +113,12 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. 
- orig_shape = hidden_states.shape - hidden_states = hidden_states.view(-1, self.hidden_size) + orig_shape, orig_type = hidden_states.shape, hidden_states.dtype + hidden_states = hidden_states.view(-1, self.hidden_size).to(self.params_dtype) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts(hidden_states.half(), router_logits) - return final_hidden_states.view(orig_shape).bfloat16() + final_hidden_states = self.experts(hidden_states, router_logits) + return final_hidden_states.view(orig_shape).to(orig_type) class MixtralAttention(nn.Module): def __init__( self, From 112aa40fd31fcfe431581716c25f732f1b482c04 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Thu, 15 Aug 2024 16:29:51 -0700 Subject: [PATCH 095/106] Formating --- .../layers/fused_moe/fused_moe_marlin.py | 4 +- .../layers/quantization/gptq_marlin.py | 3 - vllm/model_executor/models/mixtral.py | 96 ++++++++++--------- vllm/model_executor/models/mixtral_quant.py | 7 +- 4 files changed, 59 insertions(+), 51 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index 3e080a1393e16..7e834e3250f74 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -56,8 +56,8 @@ def fused_moe_marlin( 0], "Number of tokens mismatch" assert hidden_states.shape[ 1] == w1.shape[1] * 16, "Hidden size mismatch w1" - assert hidden_states.shape[ - 1] == w2.shape[2] // (num_bits // 2), "Hidden size mismatch w2" + assert hidden_states.shape[1] == w2.shape[2] // ( + num_bits // 2), "Hidden size mismatch w2" assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 90762efef8108..64453d61145ef 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -539,8 +539,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: device=device), requires_grad=False, ) - # logger.error(f"W13 qweight size - {layer.w13_qweight.size()}") - # logger.error(f"Quant Config: {self.quant_config}") # Repack weights marlin_w13_qweight = ops.gptq_marlin_moe_repack( layer.w13_qweight, @@ -567,7 +565,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: group_size=self.quant_config.group_size, ) replace_tensor(layer, "w13_scales", marlin_w13_scales) - # logger.error(f"{layer.w2_scales.size()}, {layer.intermediate_size_per_partition}, {self.quant_config.group_size}") marlin_w2_scales = marlin_moe_permute_scales( s=layer.w2_scales, size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9d99606b49ec3..f157a21fd27ae 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -63,8 +63,10 @@ from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers import logging + logger = logging.getLogger(__name__) + class MixtralMoE(nn.Module): """A tensor-parallel MoE implementation for Mixtral that shards each expert across all ranks. 
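A minimal sketch of the dtype round-trip that the MixtralMoE.forward change above establishes: flatten the input, cast it to the MoE params_dtype (float16, which the fused Marlin path expects), and cast the result back to the caller's dtype so surrounding bf16 layers are unaffected. The tensors and the doubling step below are stand-ins, not code from this patch.

import torch

x = torch.randn(4, 8, dtype=torch.bfloat16)      # stand-in for hidden_states
orig_shape, orig_dtype = x.shape, x.dtype
y = x.view(-1, 8).to(torch.float16)              # cast in for the fp16 kernels
y = y * 2.0                                      # stand-in for gate + experts
out = y.view(orig_shape).to(orig_dtype)          # cast back out
assert out.shape == orig_shape and out.dtype == torch.bfloat16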
@@ -114,12 +116,16 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. orig_shape, orig_type = hidden_states.shape, hidden_states.dtype - hidden_states = hidden_states.view(-1, self.hidden_size).to(self.params_dtype) + hidden_states = hidden_states.view(-1, self.hidden_size).to( + self.params_dtype) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) final_hidden_states = self.experts(hidden_states, router_logits) return final_hidden_states.view(orig_shape).to(orig_type) + + class MixtralAttention(nn.Module): + def __init__( self, hidden_size: int, @@ -201,6 +207,7 @@ def forward( class MixtralDecoderLayer(nn.Module): + def __init__( self, config: MixtralConfig, @@ -232,10 +239,10 @@ def __init__( params_dtype=torch.float16, prefix=f"{prefix}.block_sparse_moe", ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) def forward( self, @@ -250,7 +257,8 @@ def forward( residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states, residual = self.input_layernorm( + hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, @@ -259,12 +267,14 @@ def forward( ) # Fully Connected - hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) hidden_states = self.block_sparse_moe(hidden_states) return hidden_states, residual class MixtralModel(nn.Module): + def __init__( self, config: MixtralConfig, @@ -275,11 +285,8 @@ def __init__( ) -> None: super().__init__() self.padding_idx = config.pad_token_id - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size @@ -324,12 +331,14 @@ def forward( residual, ) if not get_pp_group().is_last_rank: - return IntermediateTensors( - {"hidden_states": hidden_states, "residual": residual} - ) + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + class MixtralForCausalLM(nn.Module, SupportsLoRA): fall_back_to_pt_during_load = False @@ -365,9 +374,11 @@ def __init__( self.config = config self.lora_config = lora_config - self.model = MixtralModel( - config, cache_config, quant_config, lora_config=lora_config, prefix="model" - ) + self.model = MixtralModel(config, + cache_config, + quant_config, + lora_config=lora_config, + prefix="model") self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -381,9 +392,8 @@ def __init__( if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, ) - self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size - ) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + 
config.vocab_size) self.sampler = Sampler() def forward( @@ -394,30 +404,29 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: - hidden_states = self.model( - input_ids, positions, kv_caches, attn_metadata, intermediate_tensors - ) + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors) return hidden_states - def compute_logits( - self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata - ) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) return logits def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, device: torch.device - ) -> IntermediateTensors: - return IntermediateTensors( - { - "hidden_states": torch.zeros( - (batch_size, self.config.hidden_size), dtype=dtype, device=device - ), - "residual": torch.zeros( - (batch_size, self.config.hidden_size), dtype=dtype, device=device - ), - } - ) + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) def sample( self, @@ -497,7 +506,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): continue param = params_dict[name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight) \ No newline at end of file + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index c9143552224f5..3ff70d222b518 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -55,7 +55,10 @@ from vllm.sequence import IntermediateTensors, SamplerOutput import logging + logger = logging.getLogger(__name__) + + class MixtralMLP(nn.Module): def __init__( @@ -361,6 +364,7 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + class MixtralForCausalLM(nn.Module): fall_back_to_pt_during_load = False @@ -374,7 +378,6 @@ def __init__( # TODO check runs with dtype=float16 self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) - logger.error(f"Using fused MoE: {self.use_fused_moe}") self.config = config self.quant_config = quant_config self.model = MixtralModel(config, self.use_fused_moe, cache_config, @@ -485,4 +488,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): else: weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) \ No newline at end of file + weight_loader(param, loaded_weight) From 1ca90987b6c8a2266217a8bc863d9d4e834ba012 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 19 Aug 2024 10:32:59 -0700 Subject: [PATCH 096/106] Moving single marlin alongside fused marlin --- tests/kernels/test_moe.py | 4 +- .../layers/fused_moe/fused_moe.py | 78 ------------------- .../layers/fused_moe/fused_moe_marlin.py | 77 ++++++++++++++++++ 3 files changed, 79 insertions(+), 80 deletions(-) 
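As context for the Marlin shape checks used in fused_moe_marlin above (w1.shape[1] * 16 and w2.shape[2] // (num_bits // 2)), a short worked example of the packing arithmetic. It assumes the usual Marlin layout of quantized values packed into int32 words over 16-row tiles; the sizes below are invented.

# pack_factor = 32 // num_bits: 8 values per int32 word at 4 bits, 4 values at 8 bits.
# A logical (size_k, size_n) expert weight is stored packed roughly as
# (size_k // 16, size_n * 16 // pack_factor), so its last dim is size_n * num_bits // 2.
size_k, size_n = 4096, 14336                     # example sizes only
for num_bits in (4, 8):
    pack_factor = 32 // num_bits
    packed_cols = size_n * 16 // pack_factor
    assert packed_cols == size_n * (num_bits // 2)
    assert packed_cols // (num_bits // 2) == size_n   # how the asserts recover the logical width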
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index b53f578988214..41d5478c2e5d0 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -10,8 +10,8 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe, single_marlin_moe -from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin, single_marlin_moe from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize, ) from vllm.model_executor.models.mixtral import MixtralMoE diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 9ae5859c4da0c..797bbfe5c71c1 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -627,81 +627,3 @@ def fused_moe( w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale) - - -def single_marlin_moe( - hidden_states: torch.Tensor, - w: torch.Tensor, - scales: torch.Tensor, - gating_output: torch.Tensor, - g_idx: torch.Tensor, - rand_perm: torch.Tensor, - topk: int, - renormalize: bool, - override_config: Optional[Dict[str, Any]] = None, - use_fp8: bool = False, -) -> torch.Tensor: - """ - This function computes a Marlin MoE MMM using weights w - and top-k gating mechanism. It is meant for testing and debugging. - - Parameters: - - hidden_states (torch.Tensor): The input tensor to the MoE layer. - - w (torch.Tensor): The first set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation - (before softmax). - - topk (int): The number of top-k experts to select. - - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. - Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override - for the kernel configuration. - - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner - products for w and w2. Defaults to False. - - Returns: - - torch.Tensor: The output tensor after applying the MoE layer. - """ - # Check constraints. 
- assert hidden_states.shape[0] == gating_output.shape[0], ( - "Number of tokens mismatch") - assert hidden_states.shape[1] == w.shape[1] * 16, "Hidden size mismatch" - assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch" - assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" - assert w.is_contiguous(), "Expert weights must be contiguous" - assert hidden_states.dtype in [ - torch.float32, torch.float16, torch.bfloat16 - ] - M, K = hidden_states.shape - E = w.shape[0] - N = w.shape[2] // 2 - - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize) - - # This might not be an optimal config for a single MMM - get_config_func = functools.partial(try_get_optimal_moe_config, - w.shape, - w.shape, - topk_ids.shape[1], - "float8" if use_fp8 else None, - override_config=override_config, - is_marlin=True) - config = get_config_func(M) - - block_size_m = config['BLOCK_SIZE_M'] - - sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) - - max_workspace_size = (N // 64) * 16 - workspace = torch.zeros(max_workspace_size, - dtype=torch.int, - device="cuda", - requires_grad=False) - - intermediate_cache = torch.ops._moe_C.marlin_gemm_moe( - hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales, - g_idx, rand_perm, workspace, M, N, K, True, E, topk, block_size_m, - True, False) - - return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index 7e834e3250f74..48760daba2e41 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -8,6 +8,83 @@ from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config +def single_marlin_moe( + hidden_states: torch.Tensor, + w: torch.Tensor, + scales: torch.Tensor, + gating_output: torch.Tensor, + g_idx: torch.Tensor, + rand_perm: torch.Tensor, + topk: int, + renormalize: bool, + override_config: Optional[Dict[str, Any]] = None, + use_fp8: bool = False, +) -> torch.Tensor: + """ + This function computes a Marlin MoE MMM using weights w + and top-k gating mechanism. It is meant for testing and debugging. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w (torch.Tensor): The first set of expert weights. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner + products for w and w2. Defaults to False. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. 
+ assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + assert hidden_states.shape[1] == w.shape[1] * 16, "Hidden size mismatch" + assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w.is_contiguous(), "Expert weights must be contiguous" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] + M, K = hidden_states.shape + E = w.shape[0] + N = w.shape[2] // 2 + + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, + renormalize) + + # This might not be an optimal config for a single MMM + get_config_func = functools.partial(try_get_optimal_moe_config, + w.shape, + w.shape, + topk_ids.shape[1], + "float8" if use_fp8 else None, + override_config=override_config, + is_marlin=True) + config = get_config_func(M) + + block_size_m = config['BLOCK_SIZE_M'] + + sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) + + max_workspace_size = (N // 64) * 16 + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + device="cuda", + requires_grad=False) + + intermediate_cache = torch.ops._moe_C.marlin_gemm_moe( + hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales, + g_idx, rand_perm, workspace, M, N, K, True, E, topk, block_size_m, + True, False) + + return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) + def fused_moe_marlin( hidden_states: torch.Tensor, w1: torch.Tensor, From 4d414252481214c5a66b5defc9245dbc69ae45d8 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 19 Aug 2024 10:36:11 -0700 Subject: [PATCH 097/106] Removing unused imports --- vllm/model_executor/layers/fused_moe/layer.py | 3 --- vllm/model_executor/models/mixtral.py | 7 +------ vllm/model_executor/models/mixtral_quant.py | 9 +++------ 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 34bad93f052dc..825236a6e3bf0 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,11 +1,8 @@ -import enum from abc import abstractmethod -from enum import Enum from typing import List, Optional, Tuple import torch -from vllm import _custom_ops as ops from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index f157a21fd27ae..dc46e2d91284d 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -22,10 +22,7 @@ # limitations under the License. 
"""Inference-only Mixtral model.""" from typing import Iterable, List, Optional, Tuple -import re import torch -import numpy as np -import torch.nn.functional as F from torch import nn from transformers import MixtralConfig @@ -33,9 +30,7 @@ from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import ( get_pp_group, - get_tensor_model_parallel_world_size, - get_tensor_model_parallel_rank, - tensor_model_parallel_all_reduce, + get_tensor_model_parallel_world_size ) from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 3ff70d222b518..2bbde985ecf0e 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -30,12 +30,9 @@ from torch import nn from transformers import MixtralConfig -from .interfaces import SupportsLoRA -from .utils import is_pp_missing_parameter, make_layers - from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, +from vllm.config import CacheConfig +from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.fused_moe import FusedMoE @@ -49,7 +46,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding, DEFAULT_VOCAB_PADDING_SIZE) + ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput From 4907f43ecefd851deaace753f904cfa4e1c4d368 Mon Sep 17 00:00:00 2001 From: DhruvaBansal00 Date: Mon, 19 Aug 2024 11:15:59 -0700 Subject: [PATCH 098/106] single marlin moe import --- vllm/model_executor/layers/fused_moe/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index beb94f10a557e..212c3ac846e55 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,5 +1,4 @@ -from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin -from vllm.model_executor.layers.fused_moe.fused_moe import single_marlin_moe +from vllm.model_executor.layers.fused_moe.fused_moe_marlin import fused_moe_marlin, single_marlin_moe from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase from vllm.triton_utils import HAS_TRITON From 315e3b605df96d1edf425d921f7a2ccdbdd93ac5 Mon Sep 17 00:00:00 2001 From: Eliza Wszola Date: Wed, 21 Aug 2024 14:02:49 +0000 Subject: [PATCH 099/106] Unify shard_id to be of str w[1-3] format --- vllm/model_executor/layers/fused_moe/layer.py | 21 +++++++++---------- .../layers/quantization/experts_int8.py | 2 +- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 484b22d3ad027..160f6948648af 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -234,9 +234,9 
@@ def weight_loader( or "_qzeros" in weight_name): if "w13" in weight_name: shard_size = loaded_weight.size()[-1] - if shard_id == 0: + if shard_id == "w1": param_data[expert_id, :, :shard_size] = loaded_weight - elif shard_id == 2 or shard_id == 1: + elif shard_id == "w3" or shard_id == "w2": param_data[expert_id, :, shard_size:] = loaded_weight else: raise ValueError(f"Invalid shard_id: {shard_id}: " @@ -357,12 +357,11 @@ def forward(self, hidden_states: torch.Tensor, def make_expert_params_mapping( cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, ckpt_up_proj_name: str, - num_experts: int) -> List[Tuple[str, str, int, int]]: + num_experts: int) -> List[Tuple[str, str, int, str]]: gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name] gate_down_up = [ ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name ] - return ([ # These are the weight scales for the experts # (param_name, weight_name, expert_id, shard_id) @@ -371,7 +370,7 @@ def make_expert_params_mapping( if weight_name in gate_up else "experts.w2_scale", f"experts.{expert_id}.{weight_name}.weight_scale", expert_id, - shard_id, + f"w{shard_id + 1}", ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) ] + [ @@ -382,7 +381,7 @@ def make_expert_params_mapping( if weight_name in gate_up else "experts.w2_weight", f"experts.{expert_id}.{weight_name}.weight", expert_id, - shard_id, + f"w{shard_id + 1}", ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) ] + [ @@ -393,7 +392,7 @@ def make_expert_params_mapping( if weight_name in gate_up else "experts.w2_scales", f"experts.{expert_id}.{weight_name}.scales", expert_id, - shard_id, + f"w{shard_id + 1}", ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) ] + [ @@ -404,7 +403,7 @@ def make_expert_params_mapping( if weight_name in gate_up else "experts.a2_scale", f"experts.{expert_id}.{weight_name}.input_scale", expert_id, - shard_id, + f"a{shard_id + 1}", ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) ] + [ @@ -415,7 +414,7 @@ def make_expert_params_mapping( if weight_name in gate_up else "experts.w2_qweight", f"experts.{expert_id}.{weight_name}.qweight", expert_id, - shard_id, + f"w{shard_id + 1}", ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) ] + [ @@ -426,7 +425,7 @@ def make_expert_params_mapping( if weight_name in gate_up else "experts.w2_g_idx", f"experts.{expert_id}.{weight_name}.g_idx", expert_id, - shard_id, + f"w{shard_id + 1}", ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) ] + [ @@ -437,7 +436,7 @@ def make_expert_params_mapping( if weight_name in gate_up else "experts.w2_qzeros", f"experts.{expert_id}.{weight_name}.qzeros", expert_id, - shard_id, + f"w{shard_id + 1}", ) for expert_id in range(num_experts) for shard_id, weight_name in enumerate(gate_down_up) ]) diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index dabf17df78fef..153bccc303ef1 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -157,7 +157,7 @@ def quantize_and_call_weight_loader(param: torch.nn.Parameter, layer.w2_scale.data[expert_id, :].copy_(scales[:, 0]) else: raise ValueError( - f"Shard id must be in [0,1,2] but got {shard_id}") + f"Shard id must be in ['w1','w2','w3'] but got {shard_id}") 
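An illustration of the string shard-id convention this commit introduces: FusedMoE.make_expert_params_mapping now emits "w1"/"w2"/"w3" (input-scale entries get an "a" prefix) instead of integer indices, and loaders branch on those strings. The tuples below are hypothetical examples for expert 0, assuming the w1/w2/w3 checkpoint names Mixtral uses; the list is illustrative, not exhaustive.

# (param_name, weight_name, expert_id, shard_id)
expert_params_mapping_example = [
    ("experts.w13_qweight", "experts.0.w1.qweight", 0, "w1"),
    ("experts.w2_qweight",  "experts.0.w2.qweight", 0, "w2"),
    ("experts.w13_qweight", "experts.0.w3.qweight", 0, "w3"),
]
# In the weight_loader hunk above, "w1" fills the first half of the fused w13
# parameter, "w3" the second half, and "w2" loads into the separate w2 parameter.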
weight_loader(param, loaded_weight, weight_name, shard_id, expert_id) From 7956a696804085a96d269a1c521fa443ad6c2ddd Mon Sep 17 00:00:00 2001 From: Eliza Wszola Date: Mon, 26 Aug 2024 07:00:39 +0000 Subject: [PATCH 100/106] Unfused codepath for non-supported quant_types --- vllm/model_executor/models/mixtral.py | 255 +++++++++++++++++++------- 1 file changed, 191 insertions(+), 64 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 3edc578e519bf..3cb1938953506 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -25,12 +25,15 @@ from typing import Iterable, List, Optional, Tuple import torch +import torch.nn.functional as F from torch import nn from transformers import MixtralConfig from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed import (get_pp_group, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -46,6 +49,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.scalar_type import scalar_types from vllm.sequence import IntermediateTensors, SamplerOutput from .interfaces import SupportsLoRA @@ -54,6 +58,45 @@ logger = logging.getLogger(__name__) +class MixtralMLP(nn.Module): + + def __init__( + self, + num_experts: int, + hidden_size: int, + intermediate_size: int, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.num_experts = num_experts + self.ffn_dim = intermediate_size + self.hidden_dim = hidden_size + + self.w1 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + quant_config=quant_config) + self.w2 = ReplicatedLinear(self.ffn_dim, + self.hidden_dim, + bias=False, + quant_config=quant_config) + self.w3 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + quant_config=quant_config) + + # TODO: Use vllm's SiluAndMul + self.act_fn = nn.SiLU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + w1_out, _ = self.w1(hidden_states) + w1_out = self.act_fn(w1_out) + w3_out, _ = self.w3(hidden_states) + current_hidden_states = w1_out * w3_out + current_hidden_states, _ = self.w2(current_hidden_states) + return current_hidden_states + + class MixtralMoE(nn.Module): """A tensor-parallel MoE implementation for Mixtral that shards each expert across all ranks. 
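The MixtralMLP fallback above keeps a separate nn.SiLU gate and carries a TODO to use vLLM's SiluAndMul. A possible variant honoring that TODO, sketched here only for illustration (plain nn.Linear stands in for ReplicatedLinear, and the class name is invented), concatenates the gate and up projections so SiluAndMul can do the split-and-multiply:

import torch
from vllm.model_executor.layers.activation import SiluAndMul

class MixtralMLPSiluAndMul(torch.nn.Module):
    def __init__(self, hidden_dim: int, ffn_dim: int) -> None:
        super().__init__()
        self.w1 = torch.nn.Linear(hidden_dim, ffn_dim, bias=False)  # gate proj
        self.w3 = torch.nn.Linear(hidden_dim, ffn_dim, bias=False)  # up proj
        self.w2 = torch.nn.Linear(ffn_dim, hidden_dim, bias=False)  # down proj
        self.act_fn = SiluAndMul()  # computes silu(x[..., :d]) * x[..., d:]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate_up = torch.cat([self.w1(x), self.w3(x)], dim=-1)       # [gate | up]
        return self.w2(self.act_fn(gate_up))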
@@ -69,6 +112,7 @@ def __init__( top_k: int, hidden_size: int, intermediate_size: int, + use_fused_moe: bool, params_dtype: Optional[torch.dtype] = torch.float16, quant_config: Optional[QuantizationConfig] = None, tp_size: Optional[int] = None, @@ -87,28 +131,68 @@ def __init__( prefix=f"{prefix}.gate", ) - self.experts = FusedMoE( - num_experts=num_experts, - top_k=top_k, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - params_dtype=params_dtype, - reduce_results=True, - renormalize=True, - quant_config=quant_config, - tp_size=tp_size, - prefix=f"{prefix}.experts", - ) + self.use_fused_moe = use_fused_moe + if self.use_fused_moe: + self.experts = FusedMoE( + num_experts=num_experts, + top_k=top_k, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=tp_size, + prefix=f"{prefix}.experts", + ) + else: + self.top_k = top_k + self.num_experts = num_experts + self.experts = nn.ModuleList([ + MixtralMLP(num_experts, + hidden_size, + intermediate_size, + quant_config=quant_config) + for idx in range(num_experts) + ]) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. orig_shape, orig_type = hidden_states.shape, hidden_states.dtype - hidden_states = hidden_states.view(-1, self.hidden_size).to( - self.params_dtype) - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts(hidden_states, router_logits) - return final_hidden_states.view(orig_shape).to(orig_type) + if self.use_fused_moe: + hidden_states = hidden_states.view(-1, self.hidden_size).to( + self.params_dtype) + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts(hidden_states, router_logits) + return final_hidden_states.view(orig_shape).to(orig_type) + else: + hidden_states = hidden_states.view(-1, self.hidden_size) + router_logits, _ = self.gate(hidden_states.half()) + routing_weights = F.softmax(router_logits, + dim=1, + dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in range(self.num_experts): + expert_layer = self.experts[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum( + dim=-1, keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + orig_shape).to(orig_type) class MixtralAttention(nn.Module): @@ -197,6 +281,7 @@ class MixtralDecoderLayer(nn.Module): def __init__( self, + use_fused_moe: bool, config: MixtralConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -221,6 +306,7 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, + use_fused_moe=use_fused_moe, quant_config=quant_config, tp_size=get_tensor_model_parallel_world_size(), params_dtype=torch.float16, @@ -264,6 +350,7 @@ class MixtralModel(nn.Module): def __init__( self, + use_fused_moe: bool, config: MixtralConfig, cache_config: 
Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -286,7 +373,11 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix + use_fused_moe, + config, + cache_config, + quant_config=quant_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) @@ -358,10 +449,13 @@ def __init__( lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() - + # TODO keep the fused mixtral_quant codepath around as long as we don't + # support all quant_types + self.use_fused_moe = (quant_config.quant_type == scalar_types.uint4b8) self.config = config self.lora_config = lora_config - self.model = MixtralModel(config, + self.model = MixtralModel(self.use_fused_moe, + config, cache_config, quant_config, lora_config=lora_config, @@ -436,65 +530,98 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts, - ) + if self.use_fused_moe: - params_dict = dict(self.named_parameters()) - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + ) - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - # Skip layers on other devices. name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + param = params_dict[name] weight_loader = param.weight_loader - weight_loader( - param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id, - is_quantized=True, - ) + weight_loader(param, loaded_weight, shard_id) break else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + # Skip layers on other devices. 
+ name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + is_quantized=True, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + else: + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: + + if ("block_sparse_moe.experts." in name + and name not in params_dict): continue param = params_dict[name] From 2511f78e3ae67d04df2cffb29e817ec6dcc3962c Mon Sep 17 00:00:00 2001 From: Eliza Wszola Date: Wed, 28 Aug 2024 14:33:16 +0000 Subject: [PATCH 101/106] uint8b128 support --- csrc/moe/marlin_moe_ops.cu | 299 ++++++++++++------ csrc/moe/marlin_moe_ops.h | 7 +- csrc/moe/torch_bindings.cpp | 9 +- tests/kernels/test_moe.py | 34 +- .../layers/fused_moe/__init__.py | 4 +- .../layers/fused_moe/fused_moe_marlin.py | 28 +- vllm/model_executor/models/mixtral.py | 4 +- 7 files changed, 264 insertions(+), 121 deletions(-) diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu index 92184f43c9eb0..e3c18ce5a50b8 100644 --- a/csrc/moe/marlin_moe_ops.cu +++ b/csrc/moe/marlin_moe_ops.cu @@ -25,6 +25,8 @@ #include +#include "core/scalar_type.hpp" + template inline std::string str(T x) { return std::to_string(x); @@ -131,11 +133,26 @@ __device__ inline int lop3(int a, int b, int c) { return res; } -// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 -// values. We mostly follow the strategy in the link below, with some small -// changes: -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h -__device__ inline FragB dequant(int q) { +// Constructs destination register by taking bytes from 2 sources (based on +// mask) +template +__device__ inline uint32_t prmt(uint32_t a) { + uint32_t res; + asm volatile("prmt.b32 %0, %1, %2, %3;\n" + : "=r"(res) + : "r"(a), "n"(start_byte), "n"(mask)); + return res; +} + +template +__device__ inline FragB dequant(int q); + +// Efficiently dequantize 4bit values packed in an int32 value into a full +// B-fragment of 4 fp16 values. 
We mostly follow the strategy in the link below, +// with some small changes: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287 +template <> +__device__ inline FragB dequant(int q) { const int LO = 0x000f000f; const int HI = 0x00f000f0; const int EX = 0x64006400; @@ -156,6 +173,28 @@ __device__ inline FragB dequant(int q) { return frag_b; } +// Fast Int8ToFp16: Efficiently dequantize 8bit int values to fp16 +// Reference: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85 +template <> +__device__ inline FragB dequant(int q) { + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + + uint32_t lo = prmt(q); + uint32_t hi = prmt(q); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; + + FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + frag_b[1] = __hsub2(*reinterpret_cast(&hi), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + return frag_b; +} + // Multiply dequantized values by the corresponding quantization scale; used // only for grouped quantization. __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { @@ -296,7 +335,8 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids, __syncthreads(); } -template ( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); + + #pragma unroll + for (int i = 0; i < b_thread_vecs; i++) { + frag_b_quant[k % 2][i] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); + } }; bool is_same_group[stages]; @@ -840,10 +893,19 @@ __device__ inline void MarlinMoESingle( // dequantization and matmul operations. #pragma unroll for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - int b_quant_shift = b_quant >> 8; + int b_quant_0, b_quant_1; + if constexpr (w_type.size_bits() == 4) { + b_quant_0 = frag_b_quant[k % 2][0][j]; + b_quant_1 = b_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k % 2]); + b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; + b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; + } - FragB frag_b0 = dequant(b_quant); + FragB frag_b0 = dequant(b_quant_0); + FragB frag_b1 = dequant(b_quant_1); // Apply scale to frag_b0 if constexpr (has_act_order) { @@ -855,8 +917,6 @@ __device__ inline void MarlinMoESingle( } } - FragB frag_b1 = dequant(b_quant_shift); - // Apply scale to frag_b1 if constexpr (has_act_order) { scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], @@ -881,13 +941,13 @@ __device__ inline void MarlinMoESingle( // multiple warps that accumulate their partial sums of the same output // location; which we have to reduce over in the end. We do in shared memory. 
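Regarding the dequant<vllm::kU8B128> path added above: prmt spreads each quantized byte into a half-precision lane whose high byte is 0x64, so the lane decodes to 1024 + q, and subtracting the magic constant 0x6480 (1152.0 in fp16) leaves q - 128, which matches the uint8b128 zero point of 128. A small host-side Python check of that bit trick (the byte value is arbitrary; this is not kernel code):

import torch

q = 0x9C                                                               # example byte, 156
packed = torch.tensor([0x6400 + q], dtype=torch.int16).view(torch.float16)
magic = torch.tensor([0x6480], dtype=torch.int16).view(torch.float16)  # 1152.0
assert float(packed) == 1024.0 + q
assert float(packed - magic) == q - 128                                # signed value with bias 128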
auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; + constexpr int red_off = threads / b_sh_stride_threads / 2; if (red_off >= 1) { - int red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); + int red_idx = threadIdx.x / b_sh_stride_threads; + constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2; + constexpr int red_sh_delta = b_sh_stride_threads; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads); // Parallel logarithmic shared memory reduction. We make sure to avoid any // unnecessary read or write iterations, e.g., for two warps we write only @@ -1035,8 +1095,10 @@ __device__ inline void MarlinMoESingle( auto write = [&](int idx, float c0, float c1, FragS& s) { half2 res = __halves2half2(__float2half(c0), __float2half(c1)); - // For per-column quantization we finally apply the scale here - if constexpr (!has_act_order && group_blocks == -1) { + // For per-column quantization we finally apply the scale here (only for + // 4-bit) + if constexpr (!has_act_order && group_blocks == -1 && + w_type.size_bits() == 4) { res = __hmul2(res, s[0]); } @@ -1169,25 +1231,67 @@ __device__ inline void MarlinMoESingle( // For per-column scales, we only fetch them here in the final step before // write-out if constexpr (!has_act_order && group_blocks == -1) { - if (last) { + if constexpr (w_type.size_bits() == 8) { if (s_sh_wr_pred) { cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); } cp_async_fence(); + } else { + if (last) { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + cp_async_fence(); + } } } thread_block_reduce(); if constexpr (!has_act_order && group_blocks == -1) { - if (last) { + if constexpr (w_type.size_bits() == 8) { cp_async_wait<0>(); __syncthreads(); if (threadIdx.x / 32 < thread_n_blocks / 4) { reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; } + + } else { + if (last) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } } } + + // For 8-bit channelwise, we apply the scale before the global reduction + // that converts the fp32 results to fp16 (so that we avoid possible + // overflow in fp16) + if constexpr (!has_act_order && group_blocks == -1 && + w_type.size_bits() == 8) { + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + scale_float(reinterpret_cast(&frag_c[i][j][0][0]), + frag_s[j / 2][2 * (j % 2) + 0]); + scale_float(reinterpret_cast(&frag_c[i][j][0][2]), + frag_s[j / 2][2 * (j % 2) + 0]); + + scale_float(reinterpret_cast(&frag_c[i][j][1][0]), + frag_s[j / 2][2 * (j % 2) + 1]); + scale_float(reinterpret_cast(&frag_c[i][j][1][2]), + frag_s[j / 2][2 * (j % 2) + 1]); + } + } + } + } + if (slice_count > 1) { // only globally reduce if there is more than one // block in a slice barrier_acquire(&locks[slice_col], slice_idx); @@ -1227,7 +1331,8 @@ __device__ inline void MarlinMoESingle( } } -template ( + MarlinMoESingle( A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, 
replicate_input, apply_weights, current_m_block); } else if (max_block == 2) { - MarlinMoESingle( + MarlinMoESingle( A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, current_m_block); } else if (max_block == 3) { - MarlinMoESingle( + MarlinMoESingle( A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, current_m_block); } else { - MarlinMoESingle( + MarlinMoESingle( A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx, expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, @@ -1342,7 +1447,8 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids, return; } -template , \ cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ - MarlinMoE \ + MarlinMoE \ <<>>( \ A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \ g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx, \ @@ -1494,42 +1601,43 @@ thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { return thread_config_t{-1, -1, -1}; } -#define CALL_IF_MOE(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ - __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ - __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ - __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ - \ - __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ - __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ - __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ - __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ - \ - __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ - __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ - __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ - __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ - \ - __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ - __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ - __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ - __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ - \ - __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ - __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ - __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ - __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) +#define CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + \ + __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 
2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, const void* sorted_ids, const void* topk_weights, const void* topk_ids, const void* s, const void* g_idx, const void* perm, void* a_tmp, void* expert_offsets, int prob_m, int prob_n, int prob_k, void* workspace, - bool has_act_order, bool is_k_full, int num_groups, - int group_size, int num_experts, int topk, - int moe_block_size, int dev, cudaStream_t stream, - int thread_k, int thread_n, int sms, int max_par, - bool replicate_input, bool apply_weights) { + vllm::ScalarType const& q_type, bool has_act_order, + bool is_k_full, int num_groups, int group_size, + int num_experts, int topk, int moe_block_size, int dev, + cudaStream_t stream, int thread_k, int thread_n, + int sms, int max_par, bool replicate_input, + bool apply_weights) { TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]"); @@ -1611,10 +1719,13 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, has_act_order = false; } + int pack_factor = 32 / q_type.size_bits(); + for (int expert_idx = 0; expert_idx < num_experts; ++expert_idx) { const int4* A_ptr = (const int4*)A; int4* a_tmp_ptr = (int4*)a_tmp; - const int4* B_ptr = (const int4*)B + (prob_n * prob_k / 32) * expert_idx; + const int4* B_ptr = + (const int4*)B + (prob_n * prob_k / (pack_factor * 4)) * expert_idx; int4* C_ptr = (int4*)C; const float* topk_weights_ptr = (const float*)topk_weights; const int* sorted_ids_ptr = (const int*)sorted_ids; @@ -1645,10 +1756,14 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C, if (false) { } - CALL_IF_MOE(16, 4, 256) - CALL_IF_MOE(8, 8, 256) - CALL_IF_MOE(8, 4, 128) - CALL_IF_MOE(4, 8, 128) + CALL_IF_MOE(vllm::kU4B8, 16, 4, 256) + CALL_IF_MOE(vllm::kU4B8, 8, 8, 256) + CALL_IF_MOE(vllm::kU4B8, 8, 4, 128) + CALL_IF_MOE(vllm::kU4B8, 4, 8, 128) + CALL_IF_MOE(vllm::kU8B128, 16, 4, 256) + CALL_IF_MOE(vllm::kU8B128, 8, 8, 256) + CALL_IF_MOE(vllm::kU8B128, 8, 4, 128) + CALL_IF_MOE(vllm::kU8B128, 4, 8, 128) else { TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + str(prob_n) + ", " + str(prob_k) + "]" + @@ -1670,9 +1785,15 @@ torch::Tensor marlin_gemm_moe( const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights, const torch::Tensor& topk_ids, const torch::Tensor& b_scales, const torch::Tensor& g_idx, const torch::Tensor& perm, - torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size, + torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type, + int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, + int64_t num_experts, int64_t topk, int64_t moe_block_size, bool replicate_input, bool apply_weights) { + TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128, + "b_q_type must be uint4b8 or uint8b128. 
Got = ", b_q_type->str()); + + int pack_factor = 32 / b_q_type->size_bits(); + int max_par = 4; int dev = a.get_device(); @@ -1733,8 +1854,8 @@ torch::Tensor marlin_gemm_moe( topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), - has_act_order, is_k_full, num_groups, group_size, num_experts, topk, - moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, + *b_q_type, has_act_order, is_k_full, num_groups, group_size, num_experts, + topk, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par, replicate_input, apply_weights); return c; } diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h index 43d264e0770d6..adee8399a4d6f 100644 --- a/csrc/moe/marlin_moe_ops.h +++ b/csrc/moe/marlin_moe_ops.h @@ -2,11 +2,14 @@ #include +#include "core/scalar_type.hpp" + torch::Tensor marlin_gemm_moe( const torch::Tensor& a, const torch::Tensor& b_q_weights, const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights, const torch::Tensor& topk_ids, const torch::Tensor& b_scales, const torch::Tensor& g_idx, const torch::Tensor& perm, - torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size, + torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type, + int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, + int64_t num_experts, int64_t topk, int64_t moe_block_size, bool replicate_input, bool apply_weights); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 33be8d3b5f703..d2352375de33c 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -12,10 +12,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " - "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int " - "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, " - "bool replicate_input, bool apply_weights) -> Tensor"); - + "g_idx, Tensor! perm, Tensor! workspace, " + "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, " + "int size_n, int size_k, bool is_k_full, int num_experts, int topk, " + "int moe_block_size, bool replicate_input, bool apply_weights)" + " -> Tensor"); m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); } diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 51c54d800a761..f7642bf02b05a 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe.fused_moe_marlin import ( - fused_moe_marlin, single_marlin_moe) + fused_moe_marlin, single_moe_marlin) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE @@ -132,11 +132,6 @@ def compute_max_diff(output, output_ref): torch.abs(output_ref)) -# TODO: make sure this test works -# @pytest.mark.skip("C compiler not installed in NM automation. " -# "This codepath follows a triton pathway, which " -# "JITs using clang or gcc. 
Since neither are installed " -# "in our test instances, we need to skip this for now.") @pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) @pytest.mark.parametrize("n", [128, 2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 1024, 512]) @@ -144,6 +139,7 @@ def compute_max_diff(output, output_ref): @pytest.mark.parametrize("topk", [2, 6]) @pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) @pytest.mark.parametrize("act_order", [True, False]) +@pytest.mark.parametrize("num_bits", [4, 8]) def test_fused_marlin_moe( m: int, n: int, @@ -152,6 +148,7 @@ def test_fused_marlin_moe( topk: int, group_size: int, act_order: bool, + num_bits: int, ): torch.manual_seed(7) @@ -165,7 +162,8 @@ def test_fused_marlin_moe( if group_size in (k, n): return - quant_type = scalar_types.uint4b8 + quant_type = (scalar_types.uint4b8 + if num_bits == 4 else scalar_types.uint8b128) dtype = torch.float16 a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 @@ -241,18 +239,14 @@ def test_fused_marlin_moe( renormalize=False, w1_scale=scales1, w2_scale=scales2, - num_bits=4, + num_bits=num_bits, ) assert compute_max_diff(marlin_output, triton_output) < 4e-2 -# TODO: make sure this test works -# UPSTREAM SYNC: breaks NM automation. -# @pytest.mark.skip("C compiler not installed in NM automation. " -# "This codepath follows a triton pathway, which " -# "JITs using clang or gcc. Since neither are installed " -# "in our test instances, we need to skip this for now.") +@pytest.mark.skip("This test is here for the sake of debugging, " + "don't run it in automated tests.") @pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) @pytest.mark.parametrize("n", [128, 2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 1024, 512]) @@ -260,7 +254,8 @@ def test_fused_marlin_moe( @pytest.mark.parametrize("topk", [2, 6]) @pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) @pytest.mark.parametrize("act_order", [True, False]) -def test_single_marlin_moe( +@pytest.mark.parametrize("num_bits", [4, 8]) +def test_marlin_moe_mmm( m: int, n: int, k: int, @@ -268,6 +263,7 @@ def test_single_marlin_moe( topk: int, group_size: int, act_order: bool, + num_bits: int, ): if topk > e: return @@ -279,7 +275,8 @@ def test_single_marlin_moe( if group_size == k: return - quant_type = scalar_types.uint4b8 + quant_type = (scalar_types.uint4b8 + if num_bits == 4 else scalar_types.uint8b128) dtype = torch.float16 a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10 @@ -307,14 +304,15 @@ def test_single_marlin_moe( sort_indices = stack_and_dev(sort_indices_l) score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe(a, + marlin_output = single_moe_marlin(a, qweight, scales, score, g_idx, sort_indices, topk, - renormalize=False) + renormalize=False, + num_bits=num_bits) torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) assert compute_max_diff(marlin_output, torch_output) < 1e-2 diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 73315d8e71fcd..18c708fd3b918 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,5 +1,5 @@ from vllm.model_executor.layers.fused_moe.fused_moe_marlin import ( - fused_moe_marlin, single_marlin_moe) + fused_moe_marlin, single_moe_marlin) from vllm.model_executor.layers.fused_moe.layer import 
(FusedMoE, FusedMoEMethodBase) from vllm.triton_utils import HAS_TRITON @@ -8,7 +8,7 @@ "FusedMoE", "FusedMoEMethodBase", "fused_moe_marlin", - "single_marlin_moe", + "single_moe_marlin", ] if HAS_TRITON: diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py index 469b20ccf24de..40f9f66f1706b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py @@ -5,12 +5,13 @@ import torch from vllm import _custom_ops as ops +from vllm.scalar_type import scalar_types from .fused_moe import (fused_topk, moe_align_block_size, try_get_optimal_moe_config) -def single_marlin_moe( +def single_moe_marlin( hidden_states: torch.Tensor, w: torch.Tensor, scales: torch.Tensor, @@ -21,6 +22,7 @@ def single_marlin_moe( renormalize: bool, override_config: Optional[Dict[str, Any]] = None, use_fp8: bool = False, + num_bits: int = 8, ) -> torch.Tensor: """ This function computes a Marlin MoE MMM using weights w @@ -38,7 +40,7 @@ def single_marlin_moe( - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner - products for w and w2. Defaults to False. + product for w. Defaults to False. Returns: - torch.Tensor: The output tensor after applying the MoE layer. @@ -53,9 +55,13 @@ def single_marlin_moe( assert hidden_states.dtype in [ torch.float32, torch.float16, torch.bfloat16 ] + assert num_bits in [4, 8] + # TODO support this + assert not use_fp8 + M, K = hidden_states.shape E = w.shape[0] - N = w.shape[2] // 2 + N = w.shape[2] // (num_bits // 2) topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, renormalize) @@ -80,10 +86,13 @@ def single_marlin_moe( device="cuda", requires_grad=False) + scalar_type = (scalar_types.uint4b8 + if num_bits == 4 else scalar_types.uint8b128) + intermediate_cache = torch.ops._moe_C.marlin_gemm_moe( hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales, - g_idx, rand_perm, workspace, M, N, K, True, E, topk, block_size_m, - True, False) + g_idx, rand_perm, workspace, scalar_type, M, N, K, True, E, topk, + block_size_m, True, False) return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) @@ -145,6 +154,10 @@ def fused_moe_marlin( assert hidden_states.dtype in [ torch.float32, torch.float16, torch.bfloat16 ] + assert num_bits in [4, 8] + # TODO support this + assert not use_fp8 + M, K = hidden_states.shape E = w1.shape[0] N = w2.shape[1] * 16 @@ -173,6 +186,9 @@ def fused_moe_marlin( device="cuda", requires_grad=False) + scalar_type = (scalar_types.uint4b8 + if num_bits == 4 else scalar_types.uint8b128) + intermediate_cache2 = torch.empty( (M * topk_ids.shape[1], N), device=hidden_states.device, @@ -189,6 +205,7 @@ def fused_moe_marlin( g_idx1, rand_perm1, workspace, + scalar_type, M, 2 * N, K, @@ -212,6 +229,7 @@ def fused_moe_marlin( g_idx2, rand_perm2, workspace, + scalar_type, M, K, N, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 3cb1938953506..963e3d4a5ed33 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -451,7 +451,9 @@ def __init__( super().__init__() # TODO keep the fused mixtral_quant codepath around as long as we don't # support all quant_types - self.use_fused_moe = (quant_config.quant_type == scalar_types.uint4b8) + self.use_fused_moe = (quant_config.quant_type == 
scalar_types.uint4b8 + or quant_config.quant_type + == scalar_types.uint8b128) self.config = config self.lora_config = lora_config self.model = MixtralModel(self.use_fused_moe, From d8feb8d73735ee83a58bd6bfabf0cdc469b8cd3f Mon Sep 17 00:00:00 2001 From: Eliza Wszola Date: Thu, 29 Aug 2024 07:59:26 +0000 Subject: [PATCH 102/106] Cleanup, compressed tensors compatibility --- .../layers/fused_moe/__init__.py | 1 - vllm/model_executor/layers/fused_moe/layer.py | 138 +++++----------- .../compressed_tensors_moe.py | 27 ++-- .../layers/quantization/experts_int8.py | 4 +- vllm/model_executor/models/mixtral.py | 12 +- vllm/model_executor/models/mixtral_quant.py | 152 +++++------------- 6 files changed, 101 insertions(+), 233 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 452c1a4b40f21..65a9b78a118c3 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -2,7 +2,6 @@ fused_moe_marlin, single_moe_marlin) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) - from vllm.triton_utils import HAS_TRITON __all__ = [ diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a5144b4242601..e54008cecde79 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -225,90 +225,6 @@ def __init__( weight_loader=self.weight_loader, ) - # def weight_loader( - # self, - # param: torch.nn.Parameter, - # loaded_weight: torch.Tensor, - # weight_name: str, - # shard_id: str, - # expert_id: int, - # is_quantized: bool = False, - # ): - # param_data = param.data - - # if is_quantized: - # if ("_qweight" in weight_name or "_scales" in weight_name - # or "_qzeros" in weight_name): - # if "w13" in weight_name: - # shard_size = loaded_weight.size()[-1] - # if shard_id == "w1": - # param_data[expert_id, :, :shard_size] = loaded_weight - # elif shard_id == "w3" or shard_id == "w2": - # param_data[expert_id, :, shard_size:] = loaded_weight - # else: - # raise ValueError(f"Invalid shard_id: {shard_id}: " - # "must be 0, 1, or 2.") - # elif "w2" in weight_name: - # param_data[expert_id][:] = loaded_weight - # else: - # raise ValueError(f"Invalid weight name: {weight_name}: " - # "must contain 'w13' or 'w2'.") - # elif "_g_idx" in weight_name: - # if "w13" not in weight_name and "w2" not in weight_name: - # raise ValueError(f"Invalid weight name: {weight_name}: " - # "must contain 'w13' or 'w2'.") - # param_data[expert_id] = loaded_weight - # else: - # raise ValueError(f"Invalid weight name: {weight_name}.") - # else: - # if shard_id not in ("w1", "w2", "w3"): - # raise ValueError(f"shard_id must be ['w1','w2','w3'] but " - # f"got {shard_id}.") - - # # Special case for fp8 scales. - # if getattr(param, "is_fp8_scale", False): - # self._load_fp8_scale(param.data, loaded_weight, weight_name, - # shard_id, expert_id) - # return - - # expert_data = param.data[expert_id] - # tp_rank = get_tensor_model_parallel_rank() - - # # If transposed, weight is saved as [input_dim, output_dim] - # # Otherwise, weight is saved as [output_dim, input_dim] - # # Default is not transposed/input dim is dim 1 - # input_dim = getattr(param, "input_dim", 1) - # output_dim = getattr(param, "output_dim", 0) - - # # Index the loaded weight for tp sharding. 
- # # down_proj: "RowParallel" so tp sharding on input_dim - # if shard_id == "w2": - # shard_dim = input_dim - # shard_size = expert_data.shape[shard_dim] - # # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim - # elif shard_id in ("w1", "w3"): - # shard_dim = output_dim - # shard_size = expert_data.shape[output_dim] // 2 - # offset = shard_size * tp_rank - # loaded_weight = loaded_weight.narrow(shard_dim, offset, shard_size) - - # # Narrow parameter and load. - # # w1, gate_proj: Load into first logical weight of w13. - # if shard_id == "w1": - # expert_data = expert_data.narrow(shard_dim, 0, shard_size) - # expert_data.copy_(loaded_weight) - # # w3, up_proj: Load into second logical weight of w13. - # elif shard_id == "w3": - # expert_data = expert_data.narrow(shard_dim, shard_size, - # shard_size) - # expert_data.copy_(loaded_weight) - # # w2, down_proj: Load into only logical weight of w2. - # elif shard_id == "w2": - # expert_data.copy_(loaded_weight) - # else: - # raise ValueError( - # f"Expected shard_id w1, w2 or w3 but got {shard_id}") - def _load_per_tensor_weight_scale(self, shard_id: str, param: torch.nn.Parameter, loaded_weight: torch.Tensor, @@ -395,9 +311,41 @@ def _load_single_value(self, param: torch.nn.Parameter, # Input scales can be loaded directly and should be equal. param_data[expert_id] = loaded_weight - def weight_loader(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor, weight_name: str, - shard_id: str, expert_id: int) -> None: + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + is_gptq: bool = False, + ): + if is_gptq: + param_data = param.data + if ("_qweight" in weight_name or "_scales" in weight_name + or "_qzeros" in weight_name): + if "w13" in weight_name: + shard_size = loaded_weight.size()[-1] + if shard_id == "w1": + param_data[expert_id, :, :shard_size] = loaded_weight + elif shard_id == "w3" or shard_id == "w2": + param_data[expert_id, :, shard_size:] = loaded_weight + else: + raise ValueError(f"Invalid shard_id: {shard_id}: " + "must be 0, 1, or 2.") + elif "w2" in weight_name: + param_data[expert_id][:] = loaded_weight + else: + raise ValueError(f"Invalid weight name: {weight_name}: " + "must contain 'w13' or 'w2'.") + elif "_g_idx" in weight_name: + if "w13" not in weight_name and "w2" not in weight_name: + raise ValueError(f"Invalid weight name: {weight_name}: " + "must contain 'w13' or 'w2'.") + param_data[expert_id] = loaded_weight + else: + raise ValueError(f"Invalid weight name: {weight_name}.") + return if shard_id not in ("w1", "w2", "w3"): raise ValueError(f"shard_id must be ['w1','w2','w3'] but " @@ -550,8 +498,8 @@ def make_expert_params_mapping( # These are the weight scales for the experts # (param_name, weight_name, expert_id, shard_id) ( - "experts.w13_scale" - if weight_name in gate_up else "experts.w2_scale", + "experts.w13_weight_scale" + if weight_name in gate_up else "experts.w2_weight_scale", f"experts.{expert_id}.{weight_name}.weight_scale", expert_id, f"w{shard_id + 1}", @@ -625,18 +573,6 @@ def make_expert_params_mapping( for shard_id, weight_name in enumerate(gate_down_up) ]) - # return [ - # # (param_name, weight_name, expert_id, shard_id) - # ("experts.w13_" if weight_name - # in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_", - # f"experts.{expert_id}.{weight_name}.", expert_id, shard_id) - # for expert_id in range(num_experts) for shard_id, weight_name in [ - # ("w1", 
ckpt_gate_proj_name), - # ("w2", ckpt_down_proj_name), - # ("w3", ckpt_up_proj_name), - # ] - # ] - def _load_fp8_scale(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, shard_id: str, expert_id: int) -> None: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 448de19971b41..ba4f719a3f97f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -269,15 +269,18 @@ def apply(self, from vllm.model_executor.layers.fused_moe.fused_moe_marlin import ( fused_moe_marlin) - return fused_moe_marlin(x, - layer.w13_weight_packed, - layer.w2_weight_packed, - router_logits, - layer.w13_g_idx, - layer.w2_g_idx, - layer.w13_g_idx_sort_indices, - layer.w2_g_idx_sort_indices, - top_k, - renormalize=renormalize, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale) + return fused_moe_marlin( + x, + layer.w13_weight_packed, + layer.w2_weight_packed, + router_logits, + layer.w13_g_idx, + layer.w2_g_idx, + layer.w13_g_idx_sort_indices, + layer.w2_g_idx_sort_indices, + top_k, + renormalize=renormalize, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + num_bits=self.num_bits, + ) diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 153bccc303ef1..6a2ed7704d13f 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -88,13 +88,13 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, 2 * intermediate_size, dtype=torch.float32), requires_grad=False) - layer.register_parameter("w13_scale", w13_scale) + layer.register_parameter("w13_weight_scale", w13_scale) w2_scale = torch.nn.Parameter(torch.zeros(num_experts, hidden_size, dtype=torch.float32), requires_grad=False) - layer.register_parameter("w2_scale", w2_scale) + layer.register_parameter("w2_weight_scale", w2_scale) def apply(self, layer: torch.nn.Module, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 56685c872c447..d3471959a1766 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -42,6 +42,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsConfig) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -450,9 +452,11 @@ def __init__( super().__init__() # TODO keep the fused mixtral_quant codepath around as long as we don't # support all quant_types - self.use_fused_moe = (quant_config.quant_type == scalar_types.uint4b8 - or quant_config.quant_type - == scalar_types.uint8b128) + self.is_compressed = isinstance(quant_config, CompressedTensorsConfig) + self.use_fused_moe = ( + self.is_compressed + or quant_config.quant_type == scalar_types.uint4b8 + or quant_config.quant_type == scalar_types.uint8b128) self.config = config self.lora_config = lora_config self.model = MixtralModel(self.use_fused_moe, @@ 
-579,7 +583,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): name, shard_id=shard_id, expert_id=expert_id, - # is_quantized=True, + is_gptq=not self.is_compressed, ) break else: diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index cd8890b15303b..8bdd52b343175 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -21,8 +21,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" -import logging -import re from typing import Iterable, List, Optional, Tuple import numpy as np @@ -36,7 +34,6 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, @@ -52,8 +49,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput -logger = logging.getLogger(__name__) - class MixtralMLP(nn.Module): @@ -99,13 +94,10 @@ class MixtralMoE(nn.Module): def __init__( self, config: MixtralConfig, - use_fused_moe: bool, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.config = config - self.use_fused_moe = use_fused_moe - self.quant_config = quant_config self.rank = get_tensor_model_parallel_rank() self.tp_size = get_tensor_model_parallel_world_size() self.num_total_experts = config.num_local_experts @@ -121,27 +113,14 @@ def __init__( raise ValueError( f"Rank {self.rank} has no experts assigned to it.") - if self.use_fused_moe: - params_dtype = torch.float16 - self.experts = FusedMoE(num_experts=self.num_total_experts, - top_k=self.top_k, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - params_dtype=params_dtype, - reduce_results=True, - renormalize=True, - quant_config=quant_config, - tp_size=self.tp_size) - else: - self.experts = nn.ModuleList([ - MixtralMLP(self.num_total_experts, - config.hidden_size, - config.intermediate_size, - quant_config=quant_config) - if idx in self.expert_indicies else None - for idx in range(self.num_total_experts) - ]) - + self.experts = nn.ModuleList([ + MixtralMLP(self.num_total_experts, + config.hidden_size, + config.intermediate_size, + quant_config=quant_config) + if idx in self.expert_indicies else None + for idx in range(self.num_total_experts) + ]) self.gate = ReplicatedLinear(config.hidden_size, self.num_total_experts, bias=False, @@ -150,36 +129,31 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - if self.use_fused_moe: - ret = self.experts(hidden_states.half(), router_logits) - return ret.bfloat16() - else: - routing_weights = F.softmax(router_logits, - dim=1, - dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = None - for expert_idx in self.expert_indicies: - expert_layer = self.experts[expert_idx] - expert_mask = (selected_experts == expert_idx) - expert_weights = (routing_weights * 
expert_mask).sum( - dim=-1, keepdim=True) - - current_hidden_states = expert_layer(hidden_states).mul_( - expert_weights) - if final_hidden_states is None: - final_hidden_states = current_hidden_states - else: - final_hidden_states.add_(current_hidden_states) - - return tensor_model_parallel_all_reduce(final_hidden_states).view( - num_tokens, hidden_dim) + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in self.expert_indicies: + expert_layer = self.experts[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum(dim=-1, + keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + num_tokens, hidden_dim) class MixtralAttention(nn.Module): @@ -264,7 +238,6 @@ class MixtralDecoderLayer(nn.Module): def __init__( self, config: MixtralConfig, - use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -281,7 +254,6 @@ def __init__( cache_config=cache_config, quant_config=quant_config) self.block_sparse_moe = MixtralMoE(config=config, - use_fused_moe=use_fused_moe, quant_config=quant_config) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -322,7 +294,6 @@ class MixtralModel(nn.Module): def __init__( self, config: MixtralConfig, - use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -336,7 +307,6 @@ def __init__( ) self.layers = nn.ModuleList([ MixtralDecoderLayer(config, - use_fused_moe, cache_config, quant_config=quant_config) for _ in range(config.num_hidden_layers) @@ -371,13 +341,9 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() - - # TODO check runs with dtype=float16 - self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) self.config = config self.quant_config = quant_config - self.model = MixtralModel(config, self.use_fused_moe, cache_config, - quant_config) + self.model = MixtralModel(config, cache_config, quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -442,51 +408,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - - if self.use_fused_moe: - if ("block_sparse_moe.experts." in name - and ".w1." not in name and ".w2." not in name - and ".w3." not in name - and name not in params_dict): - continue - - if (".qzeros" in name): - continue - - shard_id = None - expert_id = 0 - - has_any_numbered = (".qweight" in name or ".scales" in name - or ".g_idx" in name) - if (has_any_numbered and (".w1." in name)): - name = name.replace(".w1.", ".w13_") - shard_id = 0 - if (has_any_numbered and (".w2." in name)): - name = name.replace(".w2.", ".w2_") - shard_id = 0 - if (has_any_numbered and (".w3." 
in name)): - name = name.replace(".w3.", ".w13_") - shard_id = 1 - - exp_string = re.search(r"\.experts\.\d+.", name) - if exp_string: - exp_string = exp_string.group(0) - expert_id = int(exp_string.split(".")[2]) - name = name.replace(exp_string, ".experts.") - - else: - if ("block_sparse_moe.experts." in name - and name not in params_dict): - continue - + # Skip experts that are not assigned to this worker. + if ("block_sparse_moe.experts." in name + and name not in params_dict): + continue param = params_dict[name] - - if self.use_fused_moe and shard_id is not None: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight, name, shard_id, - expert_id, True) - else: - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From 3676621d4a4901a8cb2bdac6d8521b9aa9eaf11e Mon Sep 17 00:00:00 2001 From: Eliza Wszola Date: Thu, 29 Aug 2024 08:00:54 +0000 Subject: [PATCH 103/106] update todo --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index d3471959a1766..1cdf1c5f18ca5 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -450,8 +450,8 @@ def __init__( lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() - # TODO keep the fused mixtral_quant codepath around as long as we don't - # support all quant_types + # TODO keep the unfused mixtral_quant-like codepath around as long as + # we don't support all quant_types self.is_compressed = isinstance(quant_config, CompressedTensorsConfig) self.use_fused_moe = ( self.is_compressed From 75e3dd5dfe552d3bb0aece61d6fe7026d6573437 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 30 Aug 2024 09:33:51 -0400 Subject: [PATCH 104/106] Fix merge --- CMakeLists.txt | 3 +-- csrc/moe/torch_bindings.cpp | 2 ++ vllm/_custom_ops.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a0e729f356613..5b0d0ba904c32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,8 +294,7 @@ define_gpu_extension_target( set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" - "csrc/moe/topk_softmax_kernels.cu" - "csrc/moe/marlin_moe_ops.cu") + "csrc/moe/topk_softmax_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index d2352375de33c..e4fce091d24a3 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -9,6 +9,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "token_expert_indices, Tensor gating_output) -> ()"); m.impl("topk_softmax", torch::kCUDA, &topk_softmax); +#ifndef USE_ROCM m.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! 
" @@ -19,5 +20,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " -> Tensor"); m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); } +#endif REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 8331f67fbcd9f..35a3a32470f02 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -313,6 +313,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)), device=b_q_weight.device, dtype=b_q_weight.dtype) From a5f5a74eaaa43f31884314520de01168820639a7 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 30 Aug 2024 09:46:42 -0400 Subject: [PATCH 105/106] bad paste --- csrc/moe/torch_bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index e4fce091d24a3..cd65a8ee92b94 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -19,7 +19,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "int moe_block_size, bool replicate_input, bool apply_weights)" " -> Tensor"); m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); -} #endif +} REGISTER_EXTENSION(TORCH_EXTENSION_NAME) From e305306fe2ad363d1cc68722cb4d07080f71dc65 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Mon, 2 Sep 2024 10:46:19 -0400 Subject: [PATCH 106/106] GPTQFusedMoE layer --- .../layers/fused_moe/__init__.py | 3 +- vllm/model_executor/layers/fused_moe/layer.py | 292 +++++++++------- .../layers/quantization/gptq_marlin.py | 6 +- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/mixtral.py | 323 +++++------------- vllm/model_executor/models/mixtral_quant.py | 143 +++++--- 6 files changed, 364 insertions(+), 405 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 65a9b78a118c3..3e94406072482 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,13 +1,14 @@ from vllm.model_executor.layers.fused_moe.fused_moe_marlin import ( fused_moe_marlin, single_moe_marlin) from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) + FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, GPTQFusedMoE) from vllm.triton_utils import HAS_TRITON __all__ = [ "FusedMoE", "FusedMoEMethodBase", "FusedMoeWeightScaleSupported", + "GPTQFusedMoE", "fused_moe_marlin", "single_moe_marlin", ] diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index e54008cecde79..9882410af1f2c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -311,41 +311,9 @@ def _load_single_value(self, param: torch.nn.Parameter, # Input scales can be loaded directly and should be equal. 
param_data[expert_id] = loaded_weight - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - is_gptq: bool = False, - ): - if is_gptq: - param_data = param.data - if ("_qweight" in weight_name or "_scales" in weight_name - or "_qzeros" in weight_name): - if "w13" in weight_name: - shard_size = loaded_weight.size()[-1] - if shard_id == "w1": - param_data[expert_id, :, :shard_size] = loaded_weight - elif shard_id == "w3" or shard_id == "w2": - param_data[expert_id, :, shard_size:] = loaded_weight - else: - raise ValueError(f"Invalid shard_id: {shard_id}: " - "must be 0, 1, or 2.") - elif "w2" in weight_name: - param_data[expert_id][:] = loaded_weight - else: - raise ValueError(f"Invalid weight name: {weight_name}: " - "must contain 'w13' or 'w2'.") - elif "_g_idx" in weight_name: - if "w13" not in weight_name and "w2" not in weight_name: - raise ValueError(f"Invalid weight name: {weight_name}: " - "must contain 'w13' or 'w2'.") - param_data[expert_id] = loaded_weight - else: - raise ValueError(f"Invalid weight name: {weight_name}.") - return + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, weight_name: str, + shard_id: str, expert_id: int) -> None: if shard_id not in ("w1", "w2", "w3"): raise ValueError(f"shard_id must be ['w1','w2','w3'] but " @@ -476,8 +444,8 @@ def forward(self, hidden_states: torch.Tensor, top_k=self.top_k, renormalize=self.renormalize, use_grouped_topk=self.use_grouped_topk, - num_expert_group=self.num_expert_group, - topk_group=self.topk_group) + topk_group=self.topk_group, + num_expert_group=self.num_expert_group) if self.reduce_results and self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -490,88 +458,18 @@ def make_expert_params_mapping( cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, ckpt_up_proj_name: str, num_experts: int) -> List[Tuple[str, str, int, str]]: - gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name] - gate_down_up = [ - ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name - ] - return ([ - # These are the weight scales for the experts - # (param_name, weight_name, expert_id, shard_id) - ( - "experts.w13_weight_scale" - if weight_name in gate_up else "experts.w2_weight_scale", - f"experts.{expert_id}.{weight_name}.weight_scale", - expert_id, - f"w{shard_id + 1}", - ) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the weights for the experts - # (param_name, weight_name, expert_id, shard_id) - ( - "experts.w13_weight" - if weight_name in gate_up else "experts.w2_weight", - f"experts.{expert_id}.{weight_name}.weight", - expert_id, - f"w{shard_id + 1}", - ) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the weights for the experts - # (param_name, weight_name, expert_id, shard_id) - ( - "experts.w13_scales" - if weight_name in gate_up else "experts.w2_scales", - f"experts.{expert_id}.{weight_name}.scales", - expert_id, - f"w{shard_id + 1}", - ) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the weight scales for the experts - # (param_name, weight_name, expert_id, shard_id) - ( - "experts.a13_scale" - if weight_name in gate_up else "experts.a2_scale", - f"experts.{expert_id}.{weight_name}.input_scale", - expert_id, - f"a{shard_id + 1}", - ) for expert_id in range(num_experts) - for shard_id, 
weight_name in enumerate(gate_down_up) - ] + [ - # These are the qweights for the experts - # (param_name, weight_name, expert_id, shard_id) - ( - "experts.w13_qweight" - if weight_name in gate_up else "experts.w2_qweight", - f"experts.{expert_id}.{weight_name}.qweight", - expert_id, - f"w{shard_id + 1}", - ) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the g_idx and g_idx_sort_indices scales for the experts - # (param_name, weight_name, expert_id, shard_id) - ( - "experts.w13_g_idx" - if weight_name in gate_up else "experts.w2_g_idx", - f"experts.{expert_id}.{weight_name}.g_idx", - expert_id, - f"w{shard_id + 1}", - ) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the g_idx and g_idx_sort_indices scales for the experts + + return [ # (param_name, weight_name, expert_id, shard_id) - ( - "experts.w13_qzeros" - if weight_name in gate_up else "experts.w2_qzeros", - f"experts.{expert_id}.{weight_name}.qzeros", - expert_id, - f"w{shard_id + 1}", - ) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ]) + ("experts.w13_" if weight_name + in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_", + f"experts.{expert_id}.{weight_name}.", expert_id, shard_id) + for expert_id in range(num_experts) for shard_id, weight_name in [ + ("w1", ckpt_gate_proj_name), + ("w2", ckpt_down_proj_name), + ("w3", ckpt_up_proj_name), + ] + ] def _load_fp8_scale(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, @@ -597,4 +495,160 @@ def _load_fp8_scale(self, param: torch.nn.Parameter, param_data[expert_id][idx] = loaded_weight # If we are in the row parallel case (down_proj) else: - param_data[expert_id] = loaded_weight \ No newline at end of file + param_data[expert_id] = loaded_weight + + +class GPTQFusedMoE(torch.nn.Module): + """GPTQFusedMoE layer for GPTQ MoE models. + + This layer contains both MergedColumnParallel weights (gate_up_proj / + w13) and RowParallelLinear weights (down_proj/ w2). + + Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We + copy that naming convention here and handle any remapping in the + load_weights function in each model implementation. + + Args: + num_experts: Number of experts in the model + top_k: Number of experts selected for each token + hidden_size: Input hidden state size of the transformer + intermediate_size: Intermediate size of the experts + params_dtype: Data type for the parameters. + reduce_results: Whether to all all_reduce on the output of the layer + renomalize: Whether to renormalize the logits in the fused_moe kernel + quant_config: Quantization configure. 
+ """ + + def __init__( + self, + num_experts: int, + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: Optional[torch.dtype] = None, + reduce_results: bool = False, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None, + tp_size: Optional[int] = None, + prefix: str = "", + ): + super().__init__() + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + + self.tp_size = (tp_size if tp_size is not None else + get_tensor_model_parallel_world_size()) + self.top_k = top_k + self.num_experts = num_experts + self.intermediate_size = intermediate_size + self.intermediate_size_per_partition = intermediate_size // self.tp_size + self.reduce_results = reduce_results + self.renormalize = renormalize + assert (not use_grouped_topk and num_expert_group is None + and topk_group is None) + + if quant_config is None: + self.quant_method: Optional[ + QuantizeMethodBase] = UnquantizedFusedMoEMethod() + else: + self.quant_method = quant_config.get_quant_method(self, prefix) + assert self.quant_method is not None + + self.quant_method.create_weights( + layer=self, + num_experts=num_experts, + hidden_size=hidden_size, + intermediate_size=self.intermediate_size_per_partition, + params_dtype=params_dtype, + weight_loader=self.weight_loader, + ) + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, weight_name: str, + shard_id: str, expert_id: int) -> None: + + if ("_qweight" in weight_name or "_scales" in weight_name + or "_qzeros" in weight_name): + if "w13" in weight_name: + shard_size = loaded_weight.size()[-1] + if shard_id == "w1": + param.data[expert_id, :, :shard_size] = loaded_weight + elif shard_id == "w2" or shard_id == "w3": + param.data[expert_id, :, shard_size:] = loaded_weight + else: + raise ValueError(f"Invalid shard_id: {shard_id}: " + "must be w1, w2, or w3.") + elif "w2" in weight_name: + param.data[expert_id][:] = loaded_weight + else: + raise ValueError(f"Invalid weight name: {weight_name}: " + "must contain 'w13' or 'w2'.") + elif "_g_idx" in weight_name: + if "w13" not in weight_name and "w2" not in weight_name: + raise ValueError(f"Invalid weight name: {weight_name}: " + "must contain 'w13' or 'w2'.") + param.data[expert_id] = loaded_weight + else: + raise ValueError(f"Invalid weight name: {weight_name}.") + + @staticmethod + def select_experts(hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + use_grouped_topk: bool, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None): + assert (not use_grouped_topk and topk_group is None + and num_expert_group is None) + from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk + + topk_weights, topk_ids = fused_topk(hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize) + + return topk_weights, topk_ids + + def forward(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + assert self.quant_method is not None + + # Matrix multiply. 
+ final_hidden_states = self.quant_method.apply( + layer=self, + x=hidden_states, + router_logits=router_logits, + top_k=self.top_k, + renormalize=self.renormalize, + use_grouped_topk=False, + topk_group=False, + num_expert_group=False) + + if self.reduce_results and self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states + + @classmethod + def make_expert_params_mapping( + cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int) -> List[Tuple[str, str, int, str]]: + + return [ + # (param_name, weight_name, expert_id, shard_id) + ("experts.w13_" if weight_name + in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_", + f"experts.{expert_id}.{weight_name}.", expert_id, shard_id) + for expert_id in range(num_experts) for shard_id, weight_name in [ + ("w1", ckpt_gate_proj_name), + ("w2", ckpt_down_proj_name), + ("w3", ckpt_up_proj_name), + ] + ] diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 5e5fb64af8c32..ceaca8981ea08 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -9,8 +9,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.fused_moe_marlin import ( fused_moe_marlin) -from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, - FusedMoEMethodBase) +from vllm.model_executor.layers.fused_moe.layer import (FusedMoEMethodBase, + GPTQFusedMoE) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( @@ -134,7 +134,7 @@ def get_quant_method( if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): return GPTQMarlinLinearMethod(self) - elif isinstance(layer, FusedMoE): + elif isinstance(layer, GPTQFusedMoE): return GPTQMarlinMoEMethod(self) return None diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index a52b75074e4a5..8591c276b0013 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -37,7 +37,7 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), - "QuantMixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), + "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1cdf1c5f18ca5..413783ba4b259 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -21,19 +21,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
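# Illustrative sketch (plain Python, not part of the diff): the tuples that
# GPTQFusedMoE.make_expert_params_mapping in the layer.py hunk above yields
# for make_expert_params_mapping("w1", "w2", "w3", num_experts=2). Gate/up
# projections map onto the fused "experts.w13_" parameters, down_proj onto
# "experts.w2_", keyed by the per-expert checkpoint prefix.
expected_mapping = [
    ("experts.w13_", "experts.0.w1.", 0, "w1"),
    ("experts.w2_",  "experts.0.w2.", 0, "w2"),
    ("experts.w13_", "experts.0.w3.", 0, "w3"),
    ("experts.w13_", "experts.1.w1.", 1, "w1"),
    ("experts.w2_",  "experts.1.w2.", 1, "w2"),
    ("experts.w13_", "experts.1.w3.", 1, "w3"),
]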
"""Inference-only Mixtral model.""" -import logging from typing import Iterable, List, Optional, Tuple import torch -import torch.nn.functional as F from torch import nn from transformers import MixtralConfig from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -42,8 +38,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsConfig) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -51,53 +45,11 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.scalar_type import scalar_types from vllm.sequence import IntermediateTensors, SamplerOutput from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers -logger = logging.getLogger(__name__) - - -class MixtralMLP(nn.Module): - - def __init__( - self, - num_experts: int, - hidden_size: int, - intermediate_size: int, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - self.num_experts = num_experts - self.ffn_dim = intermediate_size - self.hidden_dim = hidden_size - - self.w1 = ReplicatedLinear(self.hidden_dim, - self.ffn_dim, - bias=False, - quant_config=quant_config) - self.w2 = ReplicatedLinear(self.ffn_dim, - self.hidden_dim, - bias=False, - quant_config=quant_config) - self.w3 = ReplicatedLinear(self.hidden_dim, - self.ffn_dim, - bias=False, - quant_config=quant_config) - - # TODO: Use vllm's SiluAndMul - self.act_fn = nn.SiLU() - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - w1_out, _ = self.w1(hidden_states) - w1_out = self.act_fn(w1_out) - w3_out, _ = self.w3(hidden_states) - current_hidden_states = w1_out * w3_out - current_hidden_states, _ = self.w2(current_hidden_states) - return current_hidden_states - class MixtralMoE(nn.Module): """A tensor-parallel MoE implementation for Mixtral that shards each expert @@ -108,21 +60,18 @@ class MixtralMoE(nn.Module): across ranks. """ - def __init__( - self, - num_experts: int, - top_k: int, - hidden_size: int, - intermediate_size: int, - use_fused_moe: bool, - params_dtype: Optional[torch.dtype] = torch.float16, - quant_config: Optional[QuantizationConfig] = None, - tp_size: Optional[int] = None, - prefix: str = "", - ): + def __init__(self, + num_experts: int, + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + tp_size: Optional[int] = None, + prefix: str = ""): super().__init__() self.hidden_size = hidden_size - self.params_dtype = params_dtype + # Gate always runs at half / full precision for now. 
self.gate = ReplicatedLinear(hidden_size, @@ -132,68 +81,25 @@ def __init__( quant_config=None, prefix=f"{prefix}.gate") - self.use_fused_moe = use_fused_moe - if self.use_fused_moe: - self.experts = FusedMoE( - num_experts=num_experts, - top_k=top_k, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - params_dtype=params_dtype, - reduce_results=True, - renormalize=True, - quant_config=quant_config, - tp_size=tp_size, - prefix=f"{prefix}.experts", - ) - else: - self.top_k = top_k - self.num_experts = num_experts - self.experts = nn.ModuleList([ - MixtralMLP(num_experts, - hidden_size, - intermediate_size, - quant_config=quant_config) - for idx in range(num_experts) - ]) + self.experts = FusedMoE(num_experts=num_experts, + top_k=top_k, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=tp_size, + prefix=f"{prefix}.experts") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. - orig_shape, orig_type = hidden_states.shape, hidden_states.dtype - if self.use_fused_moe: - hidden_states = hidden_states.view(-1, self.hidden_size).to( - self.params_dtype) - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts(hidden_states, router_logits) - return final_hidden_states.view(orig_shape).to(orig_type) - else: - hidden_states = hidden_states.view(-1, self.hidden_size) - router_logits, _ = self.gate(hidden_states.half()) - routing_weights = F.softmax(router_logits, - dim=1, - dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = None - for expert_idx in range(self.num_experts): - expert_layer = self.experts[expert_idx] - expert_mask = (selected_experts == expert_idx) - expert_weights = (routing_weights * expert_mask).sum( - dim=-1, keepdim=True) - - current_hidden_states = expert_layer(hidden_states).mul_( - expert_weights) - if final_hidden_states is None: - final_hidden_states = current_hidden_states - else: - final_hidden_states.add_(current_hidden_states) - - return tensor_model_parallel_all_reduce(final_hidden_states).view( - orig_shape).to(orig_type) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, self.hidden_size) + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts(hidden_states, router_logits) + return final_hidden_states.view(orig_shape) class MixtralAttention(nn.Module): @@ -254,14 +160,12 @@ def __init__( base=int(self.rope_theta), is_neox_style=True, ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) def forward( self, @@ -282,7 +186,6 @@ class MixtralDecoderLayer(nn.Module): def __init__( self, - use_fused_moe: bool, config: MixtralConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -300,19 +203,14 @@ def __init__( rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) + 
prefix=f"{prefix}.self_attn") self.block_sparse_moe = MixtralMoE( num_experts=config.num_local_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - use_fused_moe=use_fused_moe, quant_config=quant_config, - tp_size=get_tensor_model_parallel_world_size(), - params_dtype=torch.float16, - prefix=f"{prefix}.block_sparse_moe", - ) + prefix=f"{prefix}.block_sparse_moe") self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, @@ -351,7 +249,6 @@ class MixtralModel(nn.Module): def __init__( self, - use_fused_moe: bool, config: MixtralConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -360,8 +257,8 @@ def __init__( ) -> None: super().__init__() self.padding_idx = config.pad_token_id - lora_vocab = ((lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0) + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size @@ -374,14 +271,9 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( - use_fused_moe, - config, - cache_config, - quant_config=quant_config, - prefix=prefix, + config, cache_config, quant_config=quant_config, prefix=prefix ), - prefix=f"{prefix}.layers", - ) + prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -402,13 +294,9 @@ def forward( residual = intermediate_tensors["residual"] for i in range(self.start_layer, self.end_layer): layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, - residual, - ) + hidden_states, residual = layer(positions, hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, @@ -450,17 +338,11 @@ def __init__( lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() - # TODO keep the unfused mixtral_quant-like codepath around as long as - # we don't support all quant_types - self.is_compressed = isinstance(quant_config, CompressedTensorsConfig) - self.use_fused_moe = ( - self.is_compressed - or quant_config.quant_type == scalar_types.uint4b8 - or quant_config.quant_type == scalar_types.uint8b128) + self.config = config self.lora_config = lora_config - self.model = MixtralModel(self.use_fused_moe, - config, + + self.model = MixtralModel(config, cache_config, quant_config, lora_config=lora_config, @@ -535,98 +417,61 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - if self.use_fused_moe: - - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts, - ) - - params_dict = dict(self.named_parameters()) - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + 
ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts) + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): continue - for param_name, weight_name, shard_id in stacked_params_mapping: + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) break else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - # Skip layers on other devices. - name = name.replace(weight_name, param_name) - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( - param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id, - is_gptq=not self.is_compressed, - ) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - else: - - params_dict = dict(self.named_parameters()) - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, - shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): continue - - if ("block_sparse_moe.experts." in name - and name not in params_dict): + # Remapping the name of FP8 kv-scale. 
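# Illustrative sketch (plain Python, hypothetical checkpoint name, not part of
# the diff) of the name rewriting that load_weights above performs with
# expert_params_mapping before calling the parameter's weight_loader.
ckpt_name = "model.layers.0.block_sparse_moe.experts.3.w1.weight"
param_name, weight_name, expert_id, shard_id = (
    "experts.w13_", "experts.3.w1.", 3, "w1")
fused_name = ckpt_name.replace(weight_name, param_name)
assert fused_name == "model.layers.0.block_sparse_moe.experts.w13_weight"
# load_weights then calls:
#   weight_loader(param, loaded_weight, fused_name,
#                 shard_id="w1", expert_id=3)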
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: continue param = params_dict[name] diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 8bdd52b343175..b5b91c02e0ac6 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -21,6 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" +import logging from typing import Iterable, List, Optional, Tuple import numpy as np @@ -34,6 +35,7 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.fused_moe import GPTQFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, @@ -49,6 +51,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput +logger = logging.getLogger(__name__) + class MixtralMLP(nn.Module): @@ -94,10 +98,13 @@ class MixtralMoE(nn.Module): def __init__( self, config: MixtralConfig, + use_fused_moe: bool, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.config = config + self.use_fused_moe = use_fused_moe + self.quant_config = quant_config self.rank = get_tensor_model_parallel_rank() self.tp_size = get_tensor_model_parallel_world_size() self.num_total_experts = config.num_local_experts @@ -113,14 +120,28 @@ def __init__( raise ValueError( f"Rank {self.rank} has no experts assigned to it.") - self.experts = nn.ModuleList([ - MixtralMLP(self.num_total_experts, - config.hidden_size, - config.intermediate_size, - quant_config=quant_config) - if idx in self.expert_indicies else None - for idx in range(self.num_total_experts) - ]) + if self.use_fused_moe: + params_dtype = torch.float16 + self.experts = GPTQFusedMoE( + num_experts=self.num_total_experts, + top_k=self.top_k, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=self.tp_size) + else: + self.experts = nn.ModuleList([ + MixtralMLP(self.num_total_experts, + config.hidden_size, + config.intermediate_size, + quant_config=quant_config) + if idx in self.expert_indicies else None + for idx in range(self.num_total_experts) + ]) + self.gate = ReplicatedLinear(config.hidden_size, self.num_total_experts, bias=False, @@ -129,31 +150,36 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = None - for expert_idx in self.expert_indicies: - expert_layer = self.experts[expert_idx] - expert_mask = (selected_experts == expert_idx) - expert_weights = (routing_weights * expert_mask).sum(dim=-1, - keepdim=True) - - current_hidden_states = expert_layer(hidden_states).mul_( - expert_weights) - if final_hidden_states is None: - final_hidden_states = current_hidden_states - else: - 
final_hidden_states.add_(current_hidden_states) - - return tensor_model_parallel_all_reduce(final_hidden_states).view( - num_tokens, hidden_dim) + if self.use_fused_moe: + ret = self.experts(hidden_states.half(), router_logits) + return ret.bfloat16() + else: + routing_weights = F.softmax(router_logits, + dim=1, + dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in self.expert_indicies: + expert_layer = self.experts[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum( + dim=-1, keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + num_tokens, hidden_dim) class MixtralAttention(nn.Module): @@ -238,6 +264,7 @@ class MixtralDecoderLayer(nn.Module): def __init__( self, config: MixtralConfig, + use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -254,6 +281,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config) self.block_sparse_moe = MixtralMoE(config=config, + use_fused_moe=use_fused_moe, quant_config=quant_config) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -294,6 +322,7 @@ class MixtralModel(nn.Module): def __init__( self, config: MixtralConfig, + use_fused_moe: bool, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -307,6 +336,7 @@ def __init__( ) self.layers = nn.ModuleList([ MixtralDecoderLayer(config, + use_fused_moe, cache_config, quant_config=quant_config) for _ in range(config.num_hidden_layers) @@ -341,14 +371,15 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() + + self.use_fused_moe = (config.torch_dtype != torch.float8_e4m3fn) self.config = config self.quant_config = quant_config - self.model = MixtralModel(config, cache_config, quant_config) + self.model = MixtralModel(config, self.use_fused_moe, cache_config, + quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() @@ -389,6 +420,14 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = GPTQFusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts) + params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -408,11 +447,31 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Skip experts that are not assigned to this worker. - if ("block_sparse_moe.experts." 
in name - and name not in params_dict): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + + if self.use_fused_moe: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, name, shard_id, + expert_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + else: + if ("block_sparse_moe.experts." in name + and name not in params_dict): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight)
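Note on the expert weight remapping used by both load_weights() paths above: each per-expert checkpoint tensor (w1/w3 for the gate/up projections, w2 for the down projection) is routed into a single fused parameter, with an (expert_id, shard_id) pair telling the fused weight loader which slice to fill. The standalone Python sketch below illustrates that name rewrite; it is a minimal sketch only, and the helper name build_expert_params_mapping, the emitted parameter prefixes, and the example checkpoint string are assumptions for illustration, not the actual vLLM API.

from typing import List, Optional, Tuple


def build_expert_params_mapping(ckpt_gate_proj_name: str,
                                ckpt_down_proj_name: str,
                                ckpt_up_proj_name: str,
                                num_experts: int
                                ) -> List[Tuple[str, str, int, str]]:
    # w1 (gate) and w3 (up) land in a stacked "w13" parameter, w2 (down)
    # in "w2"; the expert index is kept so the loader can scatter each
    # checkpoint tensor into the right slice of the fused tensor.
    mapping: List[Tuple[str, str, int, str]] = []
    for expert_id in range(num_experts):
        for shard_id, ckpt_name in (("w1", ckpt_gate_proj_name),
                                    ("w3", ckpt_up_proj_name)):
            mapping.append(("experts.w13_",
                            f"experts.{expert_id}.{ckpt_name}.",
                            expert_id, shard_id))
        mapping.append(("experts.w2_",
                        f"experts.{expert_id}.{ckpt_down_proj_name}.",
                        expert_id, "w2"))
    return mapping


def resolve(name: str, mapping: List[Tuple[str, str, int, str]]
            ) -> Optional[Tuple[str, int, str]]:
    # Mirrors the inner loop of load_weights(): take the first entry whose
    # checkpoint substring occurs in the name, rewrite the name to the fused
    # parameter, and return the routing metadata for the weight loader.
    for param_name, weight_name, expert_id, shard_id in mapping:
        if weight_name not in name:
            continue
        return name.replace(weight_name, param_name), expert_id, shard_id
    return None


if __name__ == "__main__":
    mapping = build_expert_params_mapping("w1", "w2", "w3", num_experts=8)
    ckpt = "model.layers.0.block_sparse_moe.experts.3.w1.qweight"
    print(resolve(ckpt, mapping))
    # ('model.layers.0.block_sparse_moe.experts.w13_qweight', 3, 'w1')

The real mapping returned by make_expert_params_mapping() may differ in the exact parameter names it emits; what the loops above rely on is only the tuple shape (param_name, weight_name, expert_id, shard_id) and the substring-replace convention shown here.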