benching #1 (Draft)
wants to merge 1 commit into base: main
62 changes: 62 additions & 0 deletions ntt/kernels.cu
@@ -313,6 +313,68 @@ void coalesced_store(fr_t* inout, index_t idx, const fr_t r[z_count],
inout[idx] = r[z];
}

__global__ void bench_mul_kernel(fr_t a, fr_t b, fr_t *r, size_t n, size_t samples)
{
#ifdef __CUDA_ARCH__
    size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n)
    {
        // Serially dependent multiply chain: each iteration consumes the
        // previous result, so iterations cannot overlap within a thread.
        fr_t t = a; // initialized so the chain never reads an indeterminate value

        for (size_t s = 0; s < samples; s++)
        {
            t = t * b;
        }

        t = a * t;

        // Publish one result so the compiler cannot optimize the loop away.
        if (tid == 0)
        {
            *r = t;
        }
    }
#endif
}

__launch_bounds__(1024) __global__
void bench_add_kernel(fr_t a, fr_t b, fr_t *r, size_t n, size_t samples)
{
#ifdef __CUDA_ARCH__
    size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n)
    {
        // Serially dependent add chain, same structure as bench_mul_kernel.
        fr_t t = a; // initialized so the chain never reads an indeterminate value

        for (size_t s = 0; s < samples; s++)
        {
            t = t + b;
        }

        t = a + t;

        // Publish one result so the compiler cannot optimize the loop away.
        if (tid == 0)
        {
            *r = t;
        }
    }
#endif
}

#if defined(FEATURE_BABY_BEAR) || defined(FEATURE_GOLDILOCKS)
const static int Z_COUNT = 256/8/sizeof(fr_t);
# include "kernels/gs_mixed_radix_narrow.cu"
38 changes: 38 additions & 0 deletions ntt/ntt.cuh
@@ -291,5 +291,43 @@ public:
}
};

extern "C" int bench_fr_add_cuda(size_t device_id, size_t samples, size_t blocks, size_t threads)
{
fr_t f1 = forward_roots_of_unity[6]; // TODO: any value, random
fr_t f2 = forward_roots_of_unity[7];

fr_t h_answer;
fr_t *d_answer;
cudaMalloc(&d_answer, sizeof(fr_t));
CUDA_OK(cudaGetLastError());

bench_add_kernel<<<blocks, threads>>>(f1, f2, d_answer, (size_t)(blocks * threads), samples);
CUDA_OK(cudaGetLastError());

cudaDeviceSynchronize();

cudaMemcpy(&h_answer, d_answer, sizeof(fr_t), cudaMemcpyDeviceToHost);
cudaFree(d_answer);
return 0;
}

extern "C" int bench_fr_mul_cuda(size_t device_id, size_t samples, size_t blocks, size_t threads)
{
fr_t f1 = forward_roots_of_unity[6]; // TODO: any value, random
fr_t f2 = forward_roots_of_unity[7];

fr_t h_answer;
fr_t *d_answer;
cudaMalloc(&d_answer, sizeof(fr_t));

bench_mul_kernel<<<blocks, threads>>>(f1, f2, d_answer, (size_t)(blocks * threads), samples);

CUDA_OK(cudaGetLastError());
cudaDeviceSynchronize();

cudaMemcpy(&h_answer, d_answer, sizeof(fr_t), cudaMemcpyDeviceToHost);
cudaFree(d_answer);
return 0;
}
#endif
#endif
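Note: ntt-cuda/src/lib.rs below declares a bench_fr_sub_cuda binding, but this diff adds no matching kernel or host wrapper. A minimal sketch of the missing pair, mirroring bench_add_kernel / bench_fr_add_cuda above (hypothetical, not part of this change), could look like:

// Hypothetical: would complete the bench_fr_sub_cuda symbol declared in
// ntt-cuda/src/lib.rs, following the add/mul pattern above.
__launch_bounds__(1024) __global__
void bench_sub_kernel(fr_t a, fr_t b, fr_t *r, size_t n, size_t samples)
{
#ifdef __CUDA_ARCH__
    size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n)
    {
        fr_t t = a; // initialized, as in the add/mul kernels

        for (size_t s = 0; s < samples; s++)
        {
            t = t - b;  // serially dependent subtract chain
        }

        t = a - t;

        if (tid == 0)   // keep one result live so the loop is not optimized away
        {
            *r = t;
        }
    }
#endif
}

extern "C" int bench_fr_sub_cuda(size_t device_id, size_t samples, size_t blocks, size_t threads)
{
    (void)device_id;

    fr_t f1 = forward_roots_of_unity[6];
    fr_t f2 = forward_roots_of_unity[7];

    fr_t h_answer;
    fr_t *d_answer;
    CUDA_OK(cudaMalloc(&d_answer, sizeof(fr_t)));

    bench_sub_kernel<<<blocks, threads>>>(f1, f2, d_answer, blocks * threads, samples);
    CUDA_OK(cudaGetLastError());
    CUDA_OK(cudaDeviceSynchronize());

    CUDA_OK(cudaMemcpy(&h_answer, d_answer, sizeof(fr_t), cudaMemcpyDeviceToHost));
    CUDA_OK(cudaFree(d_answer));
    return 0;
}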
2 changes: 1 addition & 1 deletion ntt/parameters.cuh
@@ -14,7 +14,7 @@
# elif defined(FEATURE_BABY_BEAR)
# define MAX_LG_DOMAIN_SIZE 27
# else
-# define MAX_LG_DOMAIN_SIZE 28 // tested only up to 2^31 for now
+# define MAX_LG_DOMAIN_SIZE 30 // tested only up to 2^31 for now
# endif
#endif

1 change: 1 addition & 0 deletions poc/msm-cuda/Cargo.toml
@@ -31,6 +31,7 @@ ark-ec = { version = "0.3.0", features = [ "parallel" ] }
ark-bls12-381 = { version = "0.3.0", optional = true }
ark-bls12-377 = { version = "0.3.0", optional = true }
ark-bn254 = { version = "0.3.0", optional = true }
rayon = "*"

[build-dependencies]
cc = "^1.0.70"
24 changes: 20 additions & 4 deletions poc/msm-cuda/src/util.rs
@@ -8,9 +8,20 @@ use rand_chacha::ChaCha20Rng;
use ark_ec::{AffineCurve, ProjectiveCurve};
use ark_std::UniformRand;

use rayon::prelude::*;

pub fn generate_points_scalars<G: AffineCurve>(
len: usize,
) -> (Vec<G>, Vec<G::ScalarField>) {
generate_points_scalars_cond(len, true)
}

pub fn generate_points_scalars_cond<G: AffineCurve>(
len: usize,
is_with_p: bool,
) -> (Vec<G>, Vec<G::ScalarField>) {
let scalars_len = len;
// when the caller only needs scalars, generate just a token handful of points
let len = if is_with_p { len } else { 3 };
let rand_gen: usize = std::cmp::min(1usize << 11, len);
let mut rng = ChaCha20Rng::from_entropy();

@@ -21,18 +32,23 @@ pub fn generate_points_scalars<G: AffineCurve>(
.collect::<Vec<_>>(),
);
// Sprinkle in some infinity points
-if len > 2 {
+if len > 3 {
points[3] = G::zero();
}
-let scalars = (0..len)
-    .map(|_| G::ScalarField::rand(&mut rng))
-    .collect::<Vec<_>>();

while points.len() < len {
points.append(&mut points.clone());
}

points.truncate(len);

+let scalars = (0..scalars_len)
+    .into_par_iter()
+    .map(|_| {
+        // per-task RNG: ChaCha20Rng cannot be shared across rayon threads
+        let mut rng = ChaCha20Rng::from_entropy();
+        G::ScalarField::rand(&mut rng)
+    })
+    .collect::<Vec<_>>();

(points, scalars)
}
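On the shape of generate_points_scalars_cond above: at most 2^11 random points are generated and then tiled up to len, and when is_with_p is false only a token 3 points are produced at all, presumably because the NTT benchmark below consumes only the scalars; those are drawn in parallel via rayon, with each task seeding its own ChaCha20Rng from entropy.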
14 changes: 12 additions & 2 deletions poc/ntt-cuda/Cargo.toml
@@ -4,18 +4,22 @@ version = "0.1.0"
edition = "2021"
publish = false


[features]
# By default, compile with ADX extension if the host supports it.
# Binary can be executed on systems similar to the host.
-default = []
+default = ["bls12_381"]
# Compile in portable mode, without ISA extensions.
# Binary can be executed on all systems.
portable = [ "blst/portable" ]
# Enable ADX even if the host CPU doesn't support it.
# Binary can be executed on Broadwell+ and Ryzen+ systems.
force-adx = [ "blst/force-adx" ]
bls12_377 = []
-bls12_381 = []
+bls12_381 = ["msm-cuda/bls12_381"]
pallas = [ "semolina" ]
vesta = [ "semolina" ]
bn254 = []
@@ -27,6 +31,7 @@ quiet = []
blst = "~0.3.11"
semolina = { version = "~0.1.2", optional = true }
sppark = { path = "../../rust" }
msm-cuda = { path = "../msm-cuda", features = ["bls12_381"] }

[build-dependencies]
cc = "^1.0.70"
@@ -41,3 +46,8 @@ ark-bls12-377 = { version = "0.3.0" }
ark-pallas = { version = "0.3.0" }
ark-vesta = { version = "0.3.0" }
ark-bn254 = { version = "0.3.0" }
criterion = { version = "0.3", features = [ "html_reports" ] }

[[bench]]
name = "ntt"
harness = false
74 changes: 74 additions & 0 deletions poc/ntt-cuda/benches/ntt.rs
@@ -0,0 +1,74 @@
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

use criterion::{criterion_group, criterion_main, Criterion};

#[cfg(feature = "bls12_377")]
use ark_bls12_377::{G1Affine, G2Affine};
#[cfg(feature = "bls12_381")]
use ark_bls12_381::{G1Affine, G2Affine};
#[cfg(feature = "bn254")]
use ark_bn254::G1Affine;
use ark_ff::BigInteger256;

use std::str::FromStr;

use ntt_cuda::*;
use sppark::*;

use msm_cuda::util;

fn criterion_benchmark(c: &mut Criterion) {
    // Domain size is 2^BENCH_NPOW (default 2^28, within the MAX_LG_DOMAIN_SIZE
    // of 30 set in ntt/parameters.cuh).
    let bench_npow = std::env::var("BENCH_NPOW").unwrap_or("28".to_string());
    let npoints_npow = i32::from_str(&bench_npow).unwrap();

    let mut group = c.benchmark_group("CUDA");
    group.sample_size(20);

    let name = format!("2**{}", npoints_npow);
    group.bench_function(name, |b| {
        // Only the scalars are consumed; `false` skips full point generation
        // (generate_points_scalars_cond pads a token 3 points instead).
        let (_points, mut scalars) =
            util::generate_points_scalars_cond::<G1Affine>(1usize << npoints_npow, false);

        b.iter(|| {
            ntt_cuda::NTT(0, scalars.as_mut_slice(), NTTInputOutputOrder::RN);
        })
    });

group.finish();
}

criterion_group!(benches, criterion_benchmark);

criterion_main!(benches);
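With the [[bench]] target registered in Cargo.toml, this runs from poc/ntt-cuda with, e.g., BENCH_NPOW=24 cargo bench. Note that at the default 2**28 the bls12_381 scalars alone occupy 8 GiB (2^28 32-byte field elements), so a smaller BENCH_NPOW is advisable on most GPUs.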
60 changes: 60 additions & 0 deletions poc/ntt-cuda/src/lib.rs
@@ -116,3 +116,63 @@ pub fn coset_iNTT<T>(
panic!("{}", String::from(err));
}
}

use std::time::Instant;

extern "C" {
fn bench_fr_add_cuda(device_id: usize, samples: usize, blocks: usize, threads: usize) -> i32;
fn bench_fr_sub_cuda(device_id: usize, samples: usize, blocks: usize, threads: usize) -> i32;
fn bench_fr_mul_cuda(device_id: usize, samples: usize, blocks: usize, threads: usize) -> i32;
}

pub fn bench_add_fr(samples: usize, blocks: usize, threads: usize) {
unsafe {
bench_fr_add_cuda(0, samples, blocks, threads);
}
}

pub fn bench_sub_fr(samples: usize, blocks: usize, threads: usize) {
unsafe {
bench_fr_sub_cuda(0, samples, blocks, threads);
}
}

pub fn bench_mul_fr(samples: usize, blocks: usize, threads: usize) {
unsafe {
bench_fr_mul_cuda(0, samples, blocks, threads);
}
}

pub fn arith_run() {
    use std::str::FromStr;
    let bench_npow = std::env::var("ARITH_BENCH_NPOW").unwrap_or("6".to_string());
    let npoints_npow = usize::from_str(&bench_npow).unwrap();

    for blocks in [128, 256, 1024] {
        for threads in [128, 256, 1024] {
            for lg10_samples in 2..=npoints_npow {
                // each of the blocks*threads lanes runs `samples` dependent ops
                let samples = 10_usize.pow(lg10_samples as u32);
                let count = samples * blocks * threads;

                let name = format!("FR ADD 10**{}*{}*{}", lg10_samples, blocks, threads);
                let start = Instant::now();
                bench_add_fr(samples, blocks, threads);
                let elapsed = start.elapsed();
                // throughput in field operations per microsecond
                println!("{} = {} o/us", name, (count as f32) / elapsed.as_micros() as f32);

                let name = format!("FR MUL 10**{}*{}*{}", lg10_samples, blocks, threads);
                let start = Instant::now();
                bench_mul_fr(samples, blocks, threads);
                let elapsed = start.elapsed();
                println!("{} = {} o/us", name, (count as f32) / elapsed.as_micros() as f32);
            }
        }
    }
}
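Note that the Instant timer spans the full extern call, including the cudaMalloc, cudaMemcpy, and kernel-launch overhead in the wrappers above, so the printed o/us (operations per microsecond) figure understates raw ALU throughput for small sample counts.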
6 changes: 6 additions & 0 deletions poc/ntt-cuda/tests/ntt.rs
@@ -150,3 +150,9 @@ fn test_against_arkworks() {

test_ntt::<Fr, Fr, _, GeneralEvaluationDomain<Fr>>(rng);
}

#[test]
fn test_arith() {
use ntt_cuda::arith_run;
arith_run();
}
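Exposing the benchmark as a test makes it runnable ad hoc, e.g. ARITH_BENCH_NPOW=4 cargo test test_arith --release -- --nocapture (the --nocapture flag is needed to see the printed throughput lines).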