diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..0c1c0d0 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "files.associations": { + "__locale": "cpp", + "ios": "cpp" + } +} \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index e4b664f..8b07bca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pasta-msm" version = "0.1.4" -edition = "2018" +edition = "2021" license = "Apache-2.0" description = "Optimized multiscalar multiplicaton for Pasta moduli for x86_64 and aarch64" repository = "https://github.com/supranational/pasta-msm" @@ -32,6 +32,7 @@ semolina = "~0.1.3" sppark = { git = "https://github.com/lurk-lab/sppark.git", branch = "pushing-the-limit" } pasta_curves = { git = "https://github.com/lurk-lab/pasta_curves", branch = "dev", version = ">=0.3.1, <=0.5", features = ["repr-c"] } paste = "1.0.14" +rayon = "1.5" [build-dependencies] cc = "^1.0.70" @@ -44,5 +45,9 @@ rand_chacha = "^0" rayon = "1.5" [[bench]] -name = "main" +name = "msm" +harness = false + +[[bench]] +name = "spmvm" harness = false diff --git a/benches/main.rs b/benches/msm.rs similarity index 100% rename from benches/main.rs rename to benches/msm.rs diff --git a/benches/spmvm.rs b/benches/spmvm.rs new file mode 100644 index 0000000..28f3683 --- /dev/null +++ b/benches/spmvm.rs @@ -0,0 +1,110 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#![allow(dead_code)] +#![allow(unused_imports)] +#![allow(unused_mut)] +#![allow(non_snake_case)] + +use criterion::{criterion_group, criterion_main, Criterion}; + +use pasta_curves::{group::ff::{PrimeField, Field}, pallas}; +use pasta_msm::{self, utils::SparseMatrix, spmvm::{CudaSparseMatrix, sparse_matrix_witness_pallas, CudaWitness}}; +use rand::Rng; + +#[cfg(feature = "cuda")] +extern "C" { + fn cuda_available() -> bool; +} + +pub fn generate_csr( + n: usize, + m: usize, +) -> SparseMatrix { + let mut rng = rand::thread_rng(); + + let mut data = Vec::new(); + let mut col_idx = Vec::new(); + let mut row_ptr = Vec::new(); + row_ptr.push(0); + + for _ in 0..n { + let num_elements = rng.gen_range(5..=10); // Random number of elements between 5 to 10 + for _ in 0..num_elements { + data.push(F::random(&mut rng)); // Random data value + col_idx.push(rng.gen_range(0..m)); // Random column index + } + row_ptr.push(data.len()); // Add the index of the next row start + } + + data.shrink_to_fit(); + col_idx.shrink_to_fit(); + row_ptr.shrink_to_fit(); + + let csr = SparseMatrix { + data, + indices: col_idx, + indptr: row_ptr, + cols: m, + }; + + csr +} + +include!("../src/tests.rs"); + +fn criterion_benchmark(c: &mut Criterion) { + let bench_npow: usize = std::env::var("BENCH_NPOW") + .unwrap_or("17".to_string()) + .parse() + .unwrap(); + let n: usize = 1 << bench_npow; + + println!("generating random matrix and scalars, just hang on..."); + let csr = generate_csr(n, n); + let cuda_csr = + CudaSparseMatrix::new(&csr.data, &csr.indices, &csr.indptr, n, n); + let W = crate::tests::gen_scalars(n - 10); + let U = crate::tests::gen_scalars(9); + let witness = CudaWitness::new(&W, &pallas::Scalar::ONE, &U); + let scalars = [W.clone(), vec![pallas::Scalar::ONE], U.clone()].concat(); + + #[cfg(feature = "cuda")] + { + unsafe { pasta_msm::CUDA_OFF = true }; + } + + let mut group = c.benchmark_group("CPU"); + group.sample_size(10); + + group.bench_function(format!("2**{} points", bench_npow), |b| { + b.iter(|| { + let _ = csr.multiply_vec(&scalars); + }) + }); + + group.finish(); + + #[cfg(feature = "cuda")] + if unsafe { cuda_available() } { + unsafe { pasta_msm::CUDA_OFF = false }; + + let mut group = c.benchmark_group("GPU"); + group.sample_size(20); + + let mut cuda_res = vec![pallas::Scalar::ONE; cuda_csr.num_rows]; + for nthreads in [128, 256, 512, 1024] { + group.bench_function(format!("2**{} points, nthreads={}", bench_npow, nthreads), |b| { + b.iter(|| { + let _ = sparse_matrix_witness_pallas(&cuda_csr, &witness, &mut cuda_res, nthreads); + }) + }); + } + + group.finish(); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/cuda/pallas.cu b/cuda/pallas.cu index 95528f4..7bf8dfd 100644 --- a/cuda/pallas.cu +++ b/cuda/pallas.cu @@ -16,12 +16,24 @@ typedef vesta_t scalar_t; #include #include +#include #ifndef __CUDA_ARCH__ -extern "C" RustError spmvm_pallas(scalar_t *scalars, size_t nscalars) +extern "C" RustError cuda_double_pallas(double_host_t *csr, scalar_t *scalars, scalar_t *out) { - return double_scalars(scalars, nscalars); + return double_scalars(csr, scalars, out); +} + +extern "C" RustError cuda_sparse_matrix_mul_pallas(spmvm_host_t *csr, const scalar_t *scalars, scalar_t *out, size_t nthreads) +{ + return sparse_matrix_mul(csr, scalars, out, nthreads); +} + +extern "C" RustError cuda_sparse_matrix_witness_pallas( + spmvm_host_t *csr, const witness_t *witness, scalar_t *out, size_t nthreads) +{ + return sparse_matrix_witness(csr, witness, out, nthreads); } extern "C" void drop_msm_context_pallas(msm_context_t &ref) diff --git a/examples/spmvm.rs b/examples/spmvm.rs index b83ecb1..6258a43 100644 --- a/examples/spmvm.rs +++ b/examples/spmvm.rs @@ -1,19 +1,51 @@ // Copyright Supranational LLC // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +#![allow(non_snake_case)] use std::time::Instant; -use pasta_curves::group::ff::PrimeField; -use pasta_msm::spmvm::double_pallas; +use pasta_curves::{group::ff::{PrimeField, Field}, pallas}; +use pasta_msm::{ + spmvm::{sparse_matrix_mul_pallas, CudaSparseMatrix, CudaWitness, sparse_matrix_witness_pallas}, + utils::SparseMatrix, +}; +use rand::Rng; -pub fn generate_scalars( - len: usize, -) -> Vec { +pub fn generate_csr(n: usize, m: usize) -> SparseMatrix { let mut rng = rand::thread_rng(); - let scalars = (0..len) - .map(|_| F::random(&mut rng)) - .collect::>(); + + let mut data = Vec::new(); + let mut col_idx = Vec::new(); + let mut row_ptr = Vec::new(); + row_ptr.push(0); + + for _ in 0..n { + let num_elements = rng.gen_range(5..=10); // Random number of elements between 5 to 10 + for _ in 0..num_elements { + data.push(F::random(&mut rng)); // Random data value + col_idx.push(rng.gen_range(0..m)); // Random column index + } + row_ptr.push(data.len()); // Add the index of the next row start + } + + data.shrink_to_fit(); + col_idx.shrink_to_fit(); + row_ptr.shrink_to_fit(); + + let csr = SparseMatrix { + data, + indices: col_idx, + indptr: row_ptr, + cols: m, + }; + + csr +} + +pub fn generate_scalars(len: usize) -> Vec { + let mut rng = rand::thread_rng(); + let scalars = (0..len).map(|_| F::random(&mut rng)).collect::>(); scalars } @@ -21,21 +53,32 @@ pub fn generate_scalars( /// cargo run --release --example spmvm fn main() { let npow: usize = std::env::var("NPOW") - .unwrap_or("23".to_string()) + .unwrap_or("17".to_string()) .parse() .unwrap(); let n = 1usize << npow; + let nthreads: usize = std::env::var("NTHREADS") + .unwrap_or("128".to_string()) + .parse() + .unwrap(); - let mut scalars = generate_scalars(n); + let csr = generate_csr(n, n); + let cuda_csr = + CudaSparseMatrix::new(&csr.data, &csr.indices, &csr.indptr, n, n); + let W = generate_scalars(n - 10); + let U = generate_scalars(9); + let scalars = [W.clone(), vec![pallas::Scalar::ONE], U.clone()].concat(); let start = Instant::now(); - let double_scalars = scalars.iter().map(|x| x + x).collect::>(); + let res = csr.multiply_vec(&scalars); println!("cpu took: {:?}", start.elapsed()); + let witness = CudaWitness::new(&W, &pallas::Scalar::ONE, &U); + let mut cuda_res = vec![pallas::Scalar::ONE; cuda_csr.num_rows]; let start = Instant::now(); - double_pallas(&mut scalars); + sparse_matrix_witness_pallas(&cuda_csr, &witness, &mut cuda_res, nthreads); println!("gpu took: {:?}", start.elapsed()); - assert_eq!(double_scalars, scalars); + assert_eq!(res, cuda_res); println!("success!"); } diff --git a/src/lib.rs b/src/lib.rs index d3faa81..0e6ed37 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 pub mod spmvm; +pub mod utils; extern crate semolina; diff --git a/src/spmvm.rs b/src/spmvm.rs index eda10ed..9f3d7af 100644 --- a/src/spmvm.rs +++ b/src/spmvm.rs @@ -1,21 +1,140 @@ -use pasta_curves::pallas; +use std::marker::PhantomData; -pub fn sparse_matrix_mul_pallas(scalars: &mut [pallas::Scalar]) { +use pasta_curves::{group::ff::Field, pallas}; + +#[repr(C)] +pub struct CudaSparseMatrix<'a, F> { + pub data: *const F, + pub col_idx: *const usize, + pub row_ptr: *const usize, + + pub num_rows: usize, + pub num_cols: usize, + pub nnz: usize, + + _p: PhantomData<&'a F>, +} + +impl<'a, F> CudaSparseMatrix<'a, F> { + pub fn new( + data: &[F], + col_idx: &[usize], + row_ptr: &[usize], + num_rows: usize, + num_cols: usize, + ) -> Self { + assert_eq!( + data.len(), + col_idx.len(), + "data and col_idx length mismatch" + ); + assert_eq!( + row_ptr.len(), + num_rows + 1, + "row_ptr length and num_rows mismatch" + ); + + let nnz = data.len(); + CudaSparseMatrix { + data: data.as_ptr(), + col_idx: col_idx.as_ptr(), + row_ptr: row_ptr.as_ptr(), + num_rows, + num_cols, + nnz, + _p: PhantomData, + } + } +} + +#[repr(C)] +#[allow(non_snake_case)] +pub struct CudaWitness<'a, F> { + pub W: *const F, + pub u: *const F, + pub U: *const F, + pub nW: usize, + pub nU: usize, + _p: PhantomData<&'a F>, +} + +#[allow(non_snake_case)] +impl<'a, F> CudaWitness<'a, F> { + pub fn new( + W: &[F], + u: &F, + U: &[F], + ) -> Self { + let nW = W.len(); + let nU = U.len(); + CudaWitness { + W: W.as_ptr(), + u: u as *const _, + U: U.as_ptr(), + nW, + nU, + _p: PhantomData, + } + } +} + +pub fn sparse_matrix_mul_pallas( + csr: &CudaSparseMatrix, + scalars: &[pallas::Scalar], + nthreads: usize, +) -> Vec { extern "C" { - fn spmvm_pallas( - scalars: *mut pallas::Scalar, - nscalars: usize, + fn cuda_sparse_matrix_mul_pallas( + csr: *const CudaSparseMatrix, + scalars: *const pallas::Scalar, + out: *mut pallas::Scalar, + nthreads: usize, ) -> sppark::Error; } - let nscalars = scalars.len(); + let mut out = vec![pallas::Scalar::ZERO; csr.num_rows]; + let err = unsafe { + cuda_sparse_matrix_mul_pallas( + csr as *const _, + scalars.as_ptr(), + out.as_mut_ptr(), + nthreads, + ) + }; + if err.code != 0 { + panic!("{}", String::from(err)); + } + + out +} + +#[allow(non_snake_case)] +pub fn sparse_matrix_witness_pallas( + csr: &CudaSparseMatrix, + witness: &CudaWitness, + buffer: &mut [pallas::Scalar], + nthreads: usize, +) { + extern "C" { + fn cuda_sparse_matrix_witness_pallas( + csr: *const CudaSparseMatrix, + witness: *const CudaWitness, + out: *mut pallas::Scalar, + nthreads: usize, + ) -> sppark::Error; + } + + assert_eq!(witness.nW + witness.nU + 1, csr.num_cols, "invalid witness size"); + let err = unsafe { - spmvm_pallas( - scalars.as_mut_ptr(), - nscalars, + cuda_sparse_matrix_witness_pallas( + csr as *const _, + witness as *const _, + buffer.as_mut_ptr(), + nthreads, ) }; if err.code != 0 { panic!("{}", String::from(err)); } -} \ No newline at end of file +} diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 0000000..403cbd2 --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,174 @@ +//! # Sparse Matrices +//! +//! This module defines a custom implementation of CSR/CSC sparse matrices. +//! Specifically, we implement sparse matrix / dense vector multiplication +//! to compute the `A z`, `B z`, and `C z` in Nova. + +use pasta_curves::group::ff::PrimeField; +use rayon::prelude::*; + + +/// CSR format sparse matrix, We follow the names used by scipy. +/// Detailed explanation here: +#[derive(Debug, PartialEq, Eq)] +pub struct SparseMatrix { + /// all non-zero values in the matrix + pub data: Vec, + /// column indices + pub indices: Vec, + /// row information + pub indptr: Vec, + /// number of columns + pub cols: usize, +} + +/// [SparseMatrix]s are often large, and this helps with cloning bottlenecks +impl Clone for SparseMatrix { + fn clone(&self) -> Self { + Self { + data: self.data.par_iter().cloned().collect(), + indices: self.indices.par_iter().cloned().collect(), + indptr: self.indptr.par_iter().cloned().collect(), + cols: self.cols, + } + } +} + +impl SparseMatrix { + /// 0x0 empty matrix + pub fn empty() -> Self { + SparseMatrix { + data: vec![], + indices: vec![], + indptr: vec![0], + cols: 0, + } + } + + /// Construct from the COO representation; Vec. + /// We assume that the rows are sorted during construction. + pub fn new(matrix: &[(usize, usize, F)], rows: usize, cols: usize) -> Self { + let mut new_matrix = vec![vec![]; rows]; + for (row, col, val) in matrix { + new_matrix[*row].push((*col, *val)); + } + + for row in new_matrix.iter() { + assert!(row.windows(2).all(|w| w[0].0 < w[1].0)); + } + + let mut indptr = vec![0; rows + 1]; + for (i, col) in new_matrix.iter().enumerate() { + indptr[i + 1] = indptr[i] + col.len(); + } + + let mut indices = vec![]; + let mut data = vec![]; + for col in new_matrix { + let (idx, val): (Vec<_>, Vec<_>) = col.into_iter().unzip(); + indices.extend(idx); + data.extend(val); + } + + SparseMatrix { + data, + indices, + indptr, + cols, + } + } + + /// Retrieves the data for row slice [i..j] from `ptrs`. + /// We assume that `ptrs` is indexed from `indptrs` and do not check if the + /// returned slice is actually a valid row. + pub fn get_row_unchecked(&self, ptrs: &[usize; 2]) -> impl Iterator { + self.data[ptrs[0]..ptrs[1]] + .iter() + .zip(&self.indices[ptrs[0]..ptrs[1]]) + } + + /// Multiply by a dense vector; uses rayon to parallelize. + pub fn multiply_vec(&self, vector: &[F]) -> Vec { + assert_eq!(self.cols, vector.len(), "invalid shape"); + + self.multiply_vec_unchecked(vector) + } + + /// Multiply by a dense vector; uses rayon to parallelize. + /// This does not check that the shape of the matrix/vector are compatible. + pub fn multiply_vec_unchecked(&self, vector: &[F]) -> Vec { + self + .indptr + .par_windows(2) + .map(|ptrs| { + self + .get_row_unchecked(ptrs.try_into().unwrap()) + .map(|(val, col_idx)| *val * vector[*col_idx]) + .sum() + }) + .collect() + } + + /// number of non-zero entries + pub fn len(&self) -> usize { + *self.indptr.last().unwrap() + } + + /// empty matrix + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// returns a custom iterator + pub fn iter(&self) -> Iter<'_, F> { + let mut row = 0; + while self.indptr[row + 1] == 0 { + row += 1; + } + Iter { + matrix: self, + row, + i: 0, + nnz: *self.indptr.last().unwrap(), + } + } +} + +/// Iterator for sparse matrix +pub struct Iter<'a, F: PrimeField> { + matrix: &'a SparseMatrix, + row: usize, + i: usize, + nnz: usize, +} + +impl<'a, F: PrimeField> Iterator for Iter<'a, F> { + type Item = (usize, usize, F); + + fn next(&mut self) -> Option { + // are we at the end? + if self.i == self.nnz { + return None; + } + + // compute current item + let curr_item = ( + self.row, + self.matrix.indices[self.i], + self.matrix.data[self.i], + ); + + // advance the iterator + self.i += 1; + // edge case at the end + if self.i == self.nnz { + return Some(curr_item); + } + // if `i` has moved to next row + while self.i >= self.matrix.indptr[self.row + 1] { + self.row += 1; + } + + Some(curr_item) + } +} \ No newline at end of file