diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..0c1c0d0
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "files.associations": {
+        "__locale": "cpp",
+        "ios": "cpp"
+    }
+}
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index e4b664f..8b07bca 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "pasta-msm"
 version = "0.1.4"
-edition = "2018"
+edition = "2021"
 license = "Apache-2.0"
 description = "Optimized multiscalar multiplicaton for Pasta moduli for x86_64 and aarch64"
 repository = "https://github.com/supranational/pasta-msm"
@@ -32,6 +32,7 @@ semolina = "~0.1.3"
 sppark = { git = "https://github.com/lurk-lab/sppark.git", branch = "pushing-the-limit" }
 pasta_curves = { git = "https://github.com/lurk-lab/pasta_curves", branch = "dev", version = ">=0.3.1, <=0.5", features = ["repr-c"] }
 paste = "1.0.14"
+rayon = "1.5"
 
 [build-dependencies]
 cc = "^1.0.70"
@@ -44,5 +45,9 @@ rand_chacha = "^0"
 rayon = "1.5"
 
 [[bench]]
-name = "main"
+name = "msm"
+harness = false
+
+[[bench]]
+name = "spmvm"
 harness = false
diff --git a/benches/main.rs b/benches/msm.rs
similarity index 100%
rename from benches/main.rs
rename to benches/msm.rs
diff --git a/benches/spmvm.rs b/benches/spmvm.rs
new file mode 100644
index 0000000..28f3683
--- /dev/null
+++ b/benches/spmvm.rs
@@ -0,0 +1,110 @@
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#![allow(dead_code)]
+#![allow(unused_imports)]
+#![allow(unused_mut)]
+#![allow(non_snake_case)]
+
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use pasta_curves::{group::ff::{PrimeField, Field}, pallas};
+use pasta_msm::{self, utils::SparseMatrix, spmvm::{CudaSparseMatrix, sparse_matrix_witness_pallas, CudaWitness}};
+use rand::Rng;
+
+#[cfg(feature = "cuda")]
+extern "C" {
+    fn cuda_available() -> bool;
+}
+
+pub fn generate_csr<F: PrimeField>(
+    n: usize,
+    m: usize,
+) -> SparseMatrix<F> {
+    let mut rng = rand::thread_rng();
+
+    let mut data = Vec::new();
+    let mut col_idx = Vec::new();
+    let mut row_ptr = Vec::new();
+    row_ptr.push(0);
+
+    for _ in 0..n {
+        let num_elements = rng.gen_range(5..=10); // Random number of elements between 5 to 10
+        for _ in 0..num_elements {
+            data.push(F::random(&mut rng)); // Random data value
+            col_idx.push(rng.gen_range(0..m)); // Random column index
+        }
+        row_ptr.push(data.len()); // Add the index of the next row start
+    }
+
+    data.shrink_to_fit();
+    col_idx.shrink_to_fit();
+    row_ptr.shrink_to_fit();
+
+    let csr = SparseMatrix {
+        data,
+        indices: col_idx,
+        indptr: row_ptr,
+        cols: m,
+    };
+
+    csr
+}
+
+include!("../src/tests.rs");
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let bench_npow: usize = std::env::var("BENCH_NPOW")
+        .unwrap_or("17".to_string())
+        .parse()
+        .unwrap();
+    let n: usize = 1 << bench_npow;
+
+    println!("generating random matrix and scalars, just hang on...");
+    let csr = generate_csr(n, n);
+    let cuda_csr =
+        CudaSparseMatrix::new(&csr.data, &csr.indices, &csr.indptr, n, n);
+    let W = crate::tests::gen_scalars(n - 10);
+    let U = crate::tests::gen_scalars(9);
+    let witness = CudaWitness::new(&W, &pallas::Scalar::ONE, &U);
+    let scalars = [W.clone(), vec![pallas::Scalar::ONE], U.clone()].concat();
+
+    #[cfg(feature = "cuda")]
+    {
+        unsafe { pasta_msm::CUDA_OFF = true };
+    }
+
+    let mut group = c.benchmark_group("CPU");
+    group.sample_size(10);
+
+    group.bench_function(format!("2**{} points", bench_npow), |b| {
+        b.iter(|| {
+            let _ = csr.multiply_vec(&scalars);
+        })
+    });
+
+    group.finish();
+
+    #[cfg(feature = "cuda")]
+    if unsafe { cuda_available() } {
+        unsafe { pasta_msm::CUDA_OFF = false };
+
+        let mut group = c.benchmark_group("GPU");
+        group.sample_size(20);
+
+        let mut cuda_res = vec![pallas::Scalar::ONE; cuda_csr.num_rows];
+        for nthreads in [128, 256, 512, 1024] {
+            group.bench_function(format!("2**{} points, nthreads={}", bench_npow, nthreads), |b| {
+                b.iter(|| {
+                    let _ = sparse_matrix_witness_pallas(&cuda_csr, &witness, &mut cuda_res, nthreads);
+                })
+            });
+        }
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/cuda/pallas.cu b/cuda/pallas.cu
index 95528f4..7bf8dfd 100644
--- a/cuda/pallas.cu
+++ b/cuda/pallas.cu
@@ -16,12 +16,24 @@ typedef vesta_t scalar_t;
 
 #include <msm/pippenger.cuh>
 #include <spmvm/spmvm.cuh>
+#include <spmvm/double.cuh>
 
 #ifndef __CUDA_ARCH__
 
-extern "C" RustError spmvm_pallas(scalar_t *scalars, size_t nscalars)
+extern "C" RustError cuda_double_pallas(double_host_t<scalar_t> *csr, scalar_t *scalars, scalar_t *out)
 {
-    return double_scalars<scalar_t>(scalars, nscalars);
+    return double_scalars<scalar_t>(csr, scalars, out);
+}
+
+extern "C" RustError cuda_sparse_matrix_mul_pallas(spmvm_host_t<scalar_t> *csr, const scalar_t *scalars, scalar_t *out, size_t nthreads)
+{
+    return sparse_matrix_mul<scalar_t>(csr, scalars, out, nthreads);
+}
+
+extern "C" RustError cuda_sparse_matrix_witness_pallas(
+    spmvm_host_t<scalar_t> *csr, const witness_t<scalar_t> *witness, scalar_t *out, size_t nthreads)
+{
+    return sparse_matrix_witness<scalar_t>(csr, witness, out, nthreads);
 }
 
 extern "C" void drop_msm_context_pallas(msm_context_t<affine_t::mem_t> &ref)
diff --git a/examples/spmvm.rs b/examples/spmvm.rs
index b83ecb1..6258a43 100644
--- a/examples/spmvm.rs
+++ b/examples/spmvm.rs
@@ -1,19 +1,51 @@
 // Copyright Supranational LLC
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
+#![allow(non_snake_case)]
 
 use std::time::Instant;
 
-use pasta_curves::group::ff::PrimeField;
-use pasta_msm::spmvm::double_pallas;
+use pasta_curves::{group::ff::{PrimeField, Field}, pallas};
+use pasta_msm::{
+    spmvm::{sparse_matrix_mul_pallas, CudaSparseMatrix, CudaWitness, sparse_matrix_witness_pallas},
+    utils::SparseMatrix,
+};
+use rand::Rng;
 
-pub fn generate_scalars<F: PrimeField>(
-    len: usize,
-) -> Vec<F> {
+pub fn generate_csr<F: PrimeField>(n: usize, m: usize) -> SparseMatrix<F> {
     let mut rng = rand::thread_rng();
-    let scalars = (0..len)
-        .map(|_| F::random(&mut rng))
-        .collect::<Vec<_>>();
+
+    let mut data = Vec::new();
+    let mut col_idx = Vec::new();
+    let mut row_ptr = Vec::new();
+    row_ptr.push(0);
+
+    for _ in 0..n {
+        let num_elements = rng.gen_range(5..=10); // Random number of elements between 5 to 10
+        for _ in 0..num_elements {
+            data.push(F::random(&mut rng)); // Random data value
+            col_idx.push(rng.gen_range(0..m)); // Random column index
+        }
+        row_ptr.push(data.len()); // Add the index of the next row start
+    }
+
+    data.shrink_to_fit();
+    col_idx.shrink_to_fit();
+    row_ptr.shrink_to_fit();
+
+    let csr = SparseMatrix {
+        data,
+        indices: col_idx,
+        indptr: row_ptr,
+        cols: m,
+    };
+
+    csr
+}
+
+pub fn generate_scalars<F: PrimeField>(len: usize) -> Vec<F> {
+    let mut rng = rand::thread_rng();
+    let scalars = (0..len).map(|_| F::random(&mut rng)).collect::<Vec<_>>();
 
     scalars
 }
@@ -21,21 +53,32 @@ pub fn generate_scalars<F: PrimeField>(
 /// cargo run --release --example spmvm
 fn main() {
     let npow: usize = std::env::var("NPOW")
-        .unwrap_or("23".to_string())
+        .unwrap_or("17".to_string())
         .parse()
         .unwrap();
     let n = 1usize << npow;
+    let nthreads: usize = std::env::var("NTHREADS")
+        .unwrap_or("128".to_string())
+        .parse()
+        .unwrap();
 
-    let mut scalars = generate_scalars(n);
+    let csr = generate_csr(n, n);
+    let cuda_csr =
+        CudaSparseMatrix::new(&csr.data, &csr.indices, &csr.indptr, n, n);
+    let W = generate_scalars(n - 10);
+    let U = generate_scalars(9);
+    let scalars = [W.clone(), vec![pallas::Scalar::ONE], U.clone()].concat();
 
     let start = Instant::now();
-    let double_scalars = scalars.iter().map(|x| x + x).collect::<Vec<_>>();
+    let res = csr.multiply_vec(&scalars);
     println!("cpu took: {:?}", start.elapsed());
 
+    let witness = CudaWitness::new(&W, &pallas::Scalar::ONE, &U);
+    let mut cuda_res = vec![pallas::Scalar::ONE; cuda_csr.num_rows];
     let start = Instant::now();
-    double_pallas(&mut scalars);
+    sparse_matrix_witness_pallas(&cuda_csr, &witness, &mut cuda_res, nthreads);
     println!("gpu took: {:?}", start.elapsed());
 
-    assert_eq!(double_scalars, scalars);
+    assert_eq!(res, cuda_res);
     println!("success!");
 }
diff --git a/src/lib.rs b/src/lib.rs
index d3faa81..0e6ed37 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 pub mod spmvm;
+pub mod utils;
 
 extern crate semolina;
 
diff --git a/src/spmvm.rs b/src/spmvm.rs
index eda10ed..9f3d7af 100644
--- a/src/spmvm.rs
+++ b/src/spmvm.rs
@@ -1,21 +1,140 @@
-use pasta_curves::pallas;
+use std::marker::PhantomData;
 
-pub fn sparse_matrix_mul_pallas(scalars: &mut [pallas::Scalar]) {
+use pasta_curves::{group::ff::Field, pallas};
+
+#[repr(C)]
+pub struct CudaSparseMatrix<'a, F> {
+    pub data: *const F,
+    pub col_idx: *const usize,
+    pub row_ptr: *const usize,
+
+    pub num_rows: usize,
+    pub num_cols: usize,
+    pub nnz: usize,
+
+    _p: PhantomData<&'a F>,
+}
+
+impl<'a, F> CudaSparseMatrix<'a, F> {
+    pub fn new(
+        data: &[F],
+        col_idx: &[usize],
+        row_ptr: &[usize],
+        num_rows: usize,
+        num_cols: usize,
+    ) -> Self {
+        assert_eq!(
+            data.len(),
+            col_idx.len(),
+            "data and col_idx length mismatch"
+        );
+        assert_eq!(
+            row_ptr.len(),
+            num_rows + 1,
+            "row_ptr length and num_rows mismatch"
+        );
+
+        let nnz = data.len();
+        CudaSparseMatrix {
+            data: data.as_ptr(),
+            col_idx: col_idx.as_ptr(),
+            row_ptr: row_ptr.as_ptr(),
+            num_rows,
+            num_cols,
+            nnz,
+            _p: PhantomData,
+        }
+    }
+}
+
+#[repr(C)]
+#[allow(non_snake_case)]
+pub struct CudaWitness<'a, F> {
+    pub W: *const F,
+    pub u: *const F,
+    pub U: *const F,
+    pub nW: usize,
+    pub nU: usize,
+    _p: PhantomData<&'a F>,
+}
+
+#[allow(non_snake_case)]
+impl<'a, F> CudaWitness<'a, F> {
+    pub fn new(
+        W: &[F],
+        u: &F,
+        U: &[F],
+    ) -> Self {
+        let nW = W.len();
+        let nU = U.len();
+        CudaWitness {
+            W: W.as_ptr(),
+            u: u as *const _,
+            U: U.as_ptr(),
+            nW,
+            nU,
+            _p: PhantomData,
+        }
+    }
+}
+
+pub fn sparse_matrix_mul_pallas(
+    csr: &CudaSparseMatrix<pallas::Scalar>,
+    scalars: &[pallas::Scalar],
+    nthreads: usize,
+) -> Vec<pallas::Scalar> {
     extern "C" {
-        fn spmvm_pallas(
-            scalars: *mut pallas::Scalar,
-            nscalars: usize,
+        fn cuda_sparse_matrix_mul_pallas(
+            csr: *const CudaSparseMatrix<pallas::Scalar>,
+            scalars: *const pallas::Scalar,
+            out: *mut pallas::Scalar,
+            nthreads: usize,
         ) -> sppark::Error;
     }
 
-    let nscalars = scalars.len();
+    let mut out = vec![pallas::Scalar::ZERO; csr.num_rows];
+    let err = unsafe {
+        cuda_sparse_matrix_mul_pallas(
+            csr as *const _,
+            scalars.as_ptr(),
+            out.as_mut_ptr(),
+            nthreads,
+        )
+    };
+    if err.code != 0 {
+        panic!("{}", String::from(err));
+    }
+
+    out
+}
+
+#[allow(non_snake_case)]
+pub fn sparse_matrix_witness_pallas(
+    csr: &CudaSparseMatrix<pallas::Scalar>,
+    witness: &CudaWitness<pallas::Scalar>,
+    buffer: &mut [pallas::Scalar],
+    nthreads: usize,
+) {
+    extern "C" {
+        fn cuda_sparse_matrix_witness_pallas(
+            csr: *const CudaSparseMatrix<pallas::Scalar>,
+            witness: *const CudaWitness<pallas::Scalar>,
+            out: *mut pallas::Scalar,
+            nthreads: usize,
+        ) -> sppark::Error;
+    }
+
+    assert_eq!(witness.nW + witness.nU + 1, csr.num_cols, "invalid witness size");
+
     let err = unsafe {
-        spmvm_pallas(
-            scalars.as_mut_ptr(),
-            nscalars,
+        cuda_sparse_matrix_witness_pallas(
+            csr as *const _,
+            witness as *const _,
+            buffer.as_mut_ptr(),
+            nthreads,
         )
     };
     if err.code != 0 {
         panic!("{}", String::from(err));
     }
-}
\ No newline at end of file
+}
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..403cbd2
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,174 @@
+//! # Sparse Matrices
+//!
+//! This module defines a custom implementation of CSR/CSC sparse matrices.
+//! Specifically, we implement sparse matrix / dense vector multiplication
+//! to compute the `A z`, `B z`, and `C z` in Nova.
+
+use pasta_curves::group::ff::PrimeField;
+use rayon::prelude::*;
+
+
+/// CSR format sparse matrix, We follow the names used by scipy.
+/// Detailed explanation here: <https://stackoverflow.com/questions/52299420/scipy-csr-matrix-understand-indptr>
+#[derive(Debug, PartialEq, Eq)]
+pub struct SparseMatrix<F: PrimeField> {
+  /// all non-zero values in the matrix
+  pub data: Vec<F>,
+  /// column indices
+  pub indices: Vec<usize>,
+  /// row information
+  pub indptr: Vec<usize>,
+  /// number of columns
+  pub cols: usize,
+}
+
+/// [SparseMatrix]s are often large, and this helps with cloning bottlenecks
+impl<F: PrimeField> Clone for SparseMatrix<F> {
+  fn clone(&self) -> Self {
+    Self {
+      data: self.data.par_iter().cloned().collect(),
+      indices: self.indices.par_iter().cloned().collect(),
+      indptr: self.indptr.par_iter().cloned().collect(),
+      cols: self.cols,
+    }
+  }
+}
+
+impl<F: PrimeField> SparseMatrix<F> {
+  /// 0x0 empty matrix
+  pub fn empty() -> Self {
+    SparseMatrix {
+      data: vec![],
+      indices: vec![],
+      indptr: vec![0],
+      cols: 0,
+    }
+  }
+
+  /// Construct from the COO representation; Vec<usize(row), usize(col), F>.
+  /// We assume that the rows are sorted during construction.
+  pub fn new(matrix: &[(usize, usize, F)], rows: usize, cols: usize) -> Self {
+    let mut new_matrix = vec![vec![]; rows];
+    for (row, col, val) in matrix {
+      new_matrix[*row].push((*col, *val));
+    }
+
+    for row in new_matrix.iter() {
+      assert!(row.windows(2).all(|w| w[0].0 < w[1].0));
+    }
+
+    let mut indptr = vec![0; rows + 1];
+    for (i, col) in new_matrix.iter().enumerate() {
+      indptr[i + 1] = indptr[i] + col.len();
+    }
+
+    let mut indices = vec![];
+    let mut data = vec![];
+    for col in new_matrix {
+      let (idx, val): (Vec<_>, Vec<_>) = col.into_iter().unzip();
+      indices.extend(idx);
+      data.extend(val);
+    }
+
+    SparseMatrix {
+      data,
+      indices,
+      indptr,
+      cols,
+    }
+  }
+
+  /// Retrieves the data for row slice [i..j] from `ptrs`.
+  /// We assume that `ptrs` is indexed from `indptrs` and do not check if the
+  /// returned slice is actually a valid row.
+  pub fn get_row_unchecked(&self, ptrs: &[usize; 2]) -> impl Iterator<Item = (&F, &usize)> {
+    self.data[ptrs[0]..ptrs[1]]
+      .iter()
+      .zip(&self.indices[ptrs[0]..ptrs[1]])
+  }
+
+  /// Multiply by a dense vector; uses rayon to parallelize.
+  pub fn multiply_vec(&self, vector: &[F]) -> Vec<F> {
+    assert_eq!(self.cols, vector.len(), "invalid shape");
+
+    self.multiply_vec_unchecked(vector)
+  }
+
+  /// Multiply by a dense vector; uses rayon to parallelize.
+  /// This does not check that the shape of the matrix/vector are compatible.
+  pub fn multiply_vec_unchecked(&self, vector: &[F]) -> Vec<F> {
+    self
+      .indptr
+      .par_windows(2)
+      .map(|ptrs| {
+        self
+          .get_row_unchecked(ptrs.try_into().unwrap())
+          .map(|(val, col_idx)| *val * vector[*col_idx])
+          .sum()
+      })
+      .collect()
+  }
+
+  /// number of non-zero entries
+  pub fn len(&self) -> usize {
+    *self.indptr.last().unwrap()
+  }
+
+  /// empty matrix
+  pub fn is_empty(&self) -> bool {
+    self.len() == 0
+  }
+
+  /// returns a custom iterator
+  pub fn iter(&self) -> Iter<'_, F> {
+    let mut row = 0;
+    while self.indptr[row + 1] == 0 {
+      row += 1;
+    }
+    Iter {
+      matrix: self,
+      row,
+      i: 0,
+      nnz: *self.indptr.last().unwrap(),
+    }
+  }
+}
+
+/// Iterator for sparse matrix
+pub struct Iter<'a, F: PrimeField> {
+  matrix: &'a SparseMatrix<F>,
+  row: usize,
+  i: usize,
+  nnz: usize,
+}
+
+impl<'a, F: PrimeField> Iterator for Iter<'a, F> {
+  type Item = (usize, usize, F);
+
+  fn next(&mut self) -> Option<Self::Item> {
+    // are we at the end?
+    if self.i == self.nnz {
+      return None;
+    }
+
+    // compute current item
+    let curr_item = (
+      self.row,
+      self.matrix.indices[self.i],
+      self.matrix.data[self.i],
+    );
+
+    // advance the iterator
+    self.i += 1;
+    // edge case at the end
+    if self.i == self.nnz {
+      return Some(curr_item);
+    }
+    // if `i` has moved to next row
+    while self.i >= self.matrix.indptr[self.row + 1] {
+      self.row += 1;
+    }
+
+    Some(curr_item)
+  }
+}
\ No newline at end of file