diff --git a/crates/prover/benches/merkle.rs b/crates/prover/benches/merkle.rs index c6735cdb6..0ff92235e 100644 --- a/crates/prover/benches/merkle.rs +++ b/crates/prover/benches/merkle.rs @@ -1,59 +1,42 @@ #![feature(iter_array_chunks)] -use criterion::Criterion; +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; +use itertools::Itertools; +use num_traits::Zero; +use stwo_prover::core::backend::simd::SimdBackend; +use stwo_prover::core::backend::{CPUBackend, Col}; +use stwo_prover::core::fields::m31::BaseField; +use stwo_prover::core::vcs::blake2_merkle::Blake2sMerkleHasher; +use stwo_prover::core::vcs::ops::MerkleOps; -#[cfg(target_arch = "x86_64")] -pub fn cpu_merkle(c: &mut criterion::Criterion) { - use itertools::Itertools; - use num_traits::Zero; - use stwo_prover::core::backend::avx512::AVX512Backend; - use stwo_prover::core::backend::{CPUBackend, Col}; - use stwo_prover::core::fields::m31::BaseField; - use stwo_prover::core::vcs::ops::MerkleOps; - use stwo_prover::platform; +const LOG_N_ROWS: u32 = 16; - const N_COLS: usize = 1 << 8; - const LOG_SIZE: u32 = 16; - let cols = (0..N_COLS) - .map(|_| { - (0..(1 << LOG_SIZE)) - .map(|_| BaseField::zero()) - .collect::>() - }) - .collect::>(); +const LOG_N_COLS: u32 = 8; +fn bench_blake2s_merkle>(c: &mut Criterion, id: &str) { + let col: Col = (0..1 << LOG_N_ROWS).map(|_| BaseField::zero()).collect(); + let cols = (0..1 << LOG_N_COLS).map(|_| col.clone()).collect_vec(); + let col_refs = cols.iter().collect_vec(); let mut group = c.benchmark_group("merkle throughput"); - group.throughput(criterion::Throughput::Elements((N_COLS << LOG_SIZE) as u64)); - group.throughput(criterion::Throughput::Bytes( - (N_COLS << (LOG_SIZE + 2)) as u64, - )); - group.bench_function("cpu merkle", |b| { - b.iter(|| { - CPUBackend::commit_on_layer(LOG_SIZE, None, &cols.iter().collect_vec()); - }) + group.throughput(Throughput::Elements(1 << (LOG_N_COLS + LOG_N_ROWS))); + group.throughput(Throughput::Bytes(4 << (LOG_N_COLS + LOG_N_ROWS))); + group.bench_function(&format!("{id} merkle"), |b| { + b.iter_with_large_drop(|| B::commit_on_layer(LOG_N_ROWS, None, &col_refs)) }); +} - if !platform::avx512_detected() { - return; +fn blake2s_merkle_benches(c: &mut Criterion) { + #[cfg(target_arch = "x86_64")] + if stwo_prover::platform::avx512_detected() { + use stwo_prover::core::backend::avx512::AVX512Backend; + bench_blake2s_merkle::(c, "avx"); } - let cols = (0..N_COLS) - .map(|_| { - (0..(1 << LOG_SIZE)) - .map(|_| BaseField::zero()) - .collect::>() - }) - .collect::>(); - - group.bench_function("avx merkle", |b| { - b.iter(|| { - AVX512Backend::commit_on_layer(LOG_SIZE, None, &cols.iter().collect_vec()); - }) - }); + bench_blake2s_merkle::(c, "simd"); + bench_blake2s_merkle::(c, "cpu"); } -#[cfg(target_arch = "x86_64")] -criterion::criterion_group!( - name=merkle; +criterion_group!( + name = benches; config = Criterion::default().sample_size(10); - targets=cpu_merkle); -criterion::criterion_main!(merkle); + targets = blake2s_merkle_benches); +criterion_main!(benches); diff --git a/crates/prover/src/core/backend/simd/blake2s.rs b/crates/prover/src/core/backend/simd/blake2s.rs index 77b9a1b3f..58b74b7ae 100644 --- a/crates/prover/src/core/backend/simd/blake2s.rs +++ b/crates/prover/src/core/backend/simd/blake2s.rs @@ -110,7 +110,8 @@ fn rotate(x: u32x16) -> u32x16 { (x >> N) | (x << (u32::BITS - N)) } -#[inline] +// `inline(always)` can cause code parsing errors for wasm: "locals exceed maximum". +#[cfg_attr(not(target_arch = "wasm32"), inline(always))] fn round(v: &mut [u32x16; 16], m: [u32x16; 16], r: usize) { v[0] += m[SIGMA[r][0] as usize]; v[1] += m[SIGMA[r][2] as usize];