Skip to content

Commit

Permalink
perf: improve SIMD codegen
Browse files Browse the repository at this point in the history
- Improved the way we dispatch to SIMD-intensive functions.
This results in slightly larger binaries, but *massive* speedups –
throughput increases of 5, 10, or 20 percent, and in the case of
`google_map::travel_modes/rsonpath_direct_count`, 59 (fifty-nine) percent.
  • Loading branch information
V0ldek authored Oct 13, 2023
1 parent 81e580b commit 56472ed
Show file tree
Hide file tree
Showing 45 changed files with 772 additions and 414 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
"memmem",
"Mmap",
"mmaps",
"monomorphizing",
"movemask",
"ndash",
"nondescendant",
Expand Down
1 change: 1 addition & 0 deletions base.json

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions bench.nu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/home/mat/.cargo/bin/nu

# Run the `/tmp/rqbase` binary `n` times in a row with a fixed query against the
# `bestbuy_short_record.json` dataset, throwing away whatever it prints to stdout.
# NOTE(review): `/tmp/rqbase` is presumably a baseline build kept around for
# before/after timing comparisons - confirm.
def main [n: int] {
# Manual counter; the loop body executes exactly `n` times (zero times for n <= 0).
mut i = 0
while $i < $n {
# `out> /dev/null` redirects only stdout, so errors still reach the terminal.
/tmp/rqbase '$.products[*].videoChapters' ./crates/rsonpath-benchmarks/data/pison/bestbuy_short_record.json -rcount out> /dev/null
$i += 1
}
}
2 changes: 1 addition & 1 deletion crates/rsonpath-benchmarks
77 changes: 13 additions & 64 deletions crates/rsonpath-lib/src/classification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,12 @@ pub mod depth;
pub(crate) mod mask;
pub mod memmem;
pub mod quotes;
pub mod simd;
pub(crate) mod simd;
pub mod structural;

use crate::{
debug,
input::{error::InputError, InputBlockIterator},
};
use std::fmt::Display;

use crate::{debug, input::InputBlockIterator};
use quotes::{QuoteClassifiedBlock, QuoteClassifiedIterator};

/// State allowing resumption of a classifier from a particular place
Expand Down Expand Up @@ -61,7 +60,7 @@ where
{
/// Get the index in the original bytes input at which classification has stopped.
#[inline(always)]
pub fn get_idx(&self) -> usize {
pub(crate) fn get_idx(&self) -> usize {
debug!(
"iter offset: {}, block idx: {:?}",
self.iter.get_offset(),
Expand All @@ -70,63 +69,13 @@ where

self.iter.get_offset() + self.block.as_ref().map_or(0, |b| b.idx)
}
}

/// Advance this state so that [`get_idx`](ResumeClassifierState::get_idx) points at `index`.
///
/// When the target lies inside the currently held block only the in-block index is
/// updated; otherwise the required number of blocks is skipped on the inner iterator
/// and the block it lands on becomes the current one.
///
/// # Errors
/// Skipping blocks reads from the underlying [`Input`](crate::input::Input)
/// implementation, and that read can fail with an [`InputError`].
///
/// # Panics
/// If `index` is not strictly greater than the current position
/// ([`get_idx`](ResumeClassifierState::get_idx)).
#[inline]
// The assertion below is a documented precondition, not a recoverable error.
#[allow(clippy::panic_in_result_fn)]
pub fn forward_to(&mut self, index: usize) -> Result<(), InputError> {
    let current_block_start = self.iter.get_offset();
    let current_block_idx = match self.block.as_ref() {
        Some(state) => state.idx,
        None => 0,
    };
    let current_idx = current_block_start + current_block_idx;

    debug!(
        "Calling forward_to({index}) when the inner iter offset is {current_block_start} and block idx is {current_block_idx:?}"
    );

    // Moving backwards or staying in place is a caller bug.
    assert!(index > current_idx);

    // Measure the target from the start of the current block rather than from the
    // current in-block index; this turns the rest into plain division by the block size.
    let from_block_start = (index - current_idx) + current_block_idx;
    let whole_blocks = from_block_start / N;
    let in_block_idx = from_block_start % N;

    // Decide how many blocks have to be pulled from the iterator, if any.
    let blocks_to_read = match self.block.as_mut() {
        // Target is still within the block we hold: just move the index.
        Some(state) if whole_blocks == 0 => {
            state.idx = from_block_start;
            None
        }
        // We hold a block but the target is past it.
        Some(_) => Some(whole_blocks),
        // No block is held, so one extra block must be read to obtain one.
        None => Some(whole_blocks + 1),
    };

    if let Some(count) = blocks_to_read {
        self.block = self
            .iter
            .offset(count as isize)?
            .map(|block| ResumeClassifierBlockState {
                block,
                idx: in_block_idx,
            });
    }

    debug!("forward_to({index}) results in idx moved to {}", self.get_idx());

    Ok(())
}
/// Get a human-readable description of SIMD capabilities supported by rsonpath
/// on the current machine.
///
/// The description is the value returned by the crate-internal `simd::configure()`,
/// exposed to callers only through its [`Display`] implementation. The result is
/// `#[must_use]`: calling this function without formatting or printing the value
/// accomplishes nothing visible.
#[doc(hidden)]
#[inline]
#[must_use]
pub fn describe_simd() -> impl Display {
simd::configure()
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ use crate::{
FallibleIterator,
};

use super::simd::simd_dispatch;
use super::simd::config_simd;

fn classify_string(json: &str) -> Vec<Structural> {
let simd = simd::configure();

simd_dispatch!(simd => |simd| {
config_simd!(simd => |simd| {
let json_string = json.to_owned();
let bytes = OwnedBytes::try_from(json_string).unwrap();
let iter = bytes.iter_blocks(&EmptyRecorder);
Expand Down
8 changes: 3 additions & 5 deletions crates/rsonpath-lib/src/classification/depth/avx2_32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ const SIZE: usize = 32;

shared::depth_classifier!(Avx2VectorIterator32, DelimiterClassifierImpl256, DepthVector32, 32, u32);

/// Build a [`DepthVector32`] for the quote-classified block `bytes` using `classifier`.
///
/// Thin convenience wrapper: delegates to `new_vector_from` with `0` as the last argument.
/// NOTE(review): that argument is presumably the starting index within the block - confirm
/// against `new_vector_from`.
#[inline]
#[inline(always)]
fn new_vector<'a, B: InputBlock<'a, SIZE>>(
bytes: QuoteClassifiedBlock<B, u32, SIZE>,
classifier: &DelimiterClassifierImpl256,
) -> DepthVector32<'a, B> {
new_vector_from(bytes, classifier, 0)
}

#[inline]
#[inline(always)]
fn new_vector_from<'a, B: InputBlock<'a, SIZE>>(
bytes: QuoteClassifiedBlock<B, u32, SIZE>,
classifier: &DelimiterClassifierImpl256,
Expand All @@ -29,9 +29,7 @@ fn new_vector_from<'a, B: InputBlock<'a, SIZE>>(
unsafe { new_avx2(bytes, classifier, idx) }
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "popcnt")]
#[inline]
#[inline(always)]
unsafe fn new_avx2<'a, B: InputBlock<'a, SIZE>>(
bytes: QuoteClassifiedBlock<B, u32, SIZE>,
classifier: &DelimiterClassifierImpl256,
Expand Down
8 changes: 3 additions & 5 deletions crates/rsonpath-lib/src/classification/depth/avx2_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ const SIZE: usize = 64;

shared::depth_classifier!(Avx2VectorIterator64, DelimiterClassifierImpl256, DepthVector64, 64, u64);

/// Build a [`DepthVector64`] for the quote-classified block `bytes` using `classifier`.
///
/// Thin convenience wrapper: delegates to `new_vector_from` with `0` as the last argument.
/// NOTE(review): that argument is presumably the starting index within the block - confirm
/// against `new_vector_from`.
#[inline]
#[inline(always)]
fn new_vector<'a, B: InputBlock<'a, SIZE>>(
bytes: QuoteClassifiedBlock<B, u64, SIZE>,
classifier: &DelimiterClassifierImpl256,
) -> DepthVector64<'a, B> {
new_vector_from(bytes, classifier, 0)
}

#[inline]
#[inline(always)]
fn new_vector_from<'a, B: InputBlock<'a, SIZE>>(
bytes: QuoteClassifiedBlock<B, u64, SIZE>,
classifier: &DelimiterClassifierImpl256,
Expand All @@ -30,9 +30,7 @@ fn new_vector_from<'a, B: InputBlock<'a, SIZE>>(
unsafe { new_avx2(bytes, classifier, idx) }
}

#[target_feature(enable = "avx2")]
#[target_feature(enable = "popcnt")]
#[inline]
#[inline(always)]
unsafe fn new_avx2<'a, B: InputBlock<'a, SIZE>>(
bytes: QuoteClassifiedBlock<B, u64, SIZE>,
classifier: &DelimiterClassifierImpl256,
Expand Down
2 changes: 2 additions & 0 deletions crates/rsonpath-lib/src/classification/depth/shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ macro_rules! depth_classifier {
{
type Block = $vector<'a, I::Block>;

#[inline(always)]
fn stop(self, block: Option<Self::Block>) -> ResumeClassifierState<'a, I, Q, $mask_ty, $size> {
let block_state = block.and_then(|b| {
let idx = b.idx;
Expand All @@ -85,6 +86,7 @@ macro_rules! depth_classifier {
}
}

#[inline(always)]
fn resume(
state: ResumeClassifierState<'a, I, Q, $mask_ty, $size>,
opening: BracketType,
Expand Down
15 changes: 6 additions & 9 deletions crates/rsonpath-lib/src/classification/depth/shared/mask_32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,10 @@ pub(crate) struct DepthVector32<'a, B: InputBlock<'a, SIZE>> {
pub(crate) phantom: PhantomData<&'a ()>,
}

/// Count the bits set in `mask`, returning the count as a signed integer so it can be
/// mixed directly into signed depth arithmetic.
///
/// # Safety
/// The function is compiled with the `popcnt` target feature enabled, so it must only
/// be called on a CPU that supports that feature.
#[target_feature(enable = "popcnt")]
#[inline]
unsafe fn popcnt(mask: u32) -> i32 {
    // A `u32` has at most 32 set bits, so the conversion to `i32` is lossless.
    let set_bits: u32 = u32::count_ones(mask);
    set_bits as i32
}

impl<'a, B: InputBlock<'a, SIZE>> DepthBlock<'a> for DepthVector32<'a, B> {
#[inline(always)]
fn advance_to_next_depth_decrease(&mut self) -> bool {
debug_assert!(is_x86_feature_detected!("popcnt"));
let next_closing = self.closing_mask.trailing_zeros() as usize;

if next_closing == SIZE {
Expand All @@ -52,7 +47,7 @@ impl<'a, B: InputBlock<'a, SIZE>> DepthBlock<'a> for DepthVector32<'a, B> {
bin_u32!("new opening_mask", self.opening_mask);
bin_u32!("new closing_mask", self.closing_mask);

let new_opening_count = unsafe { popcnt(self.opening_mask) };
let new_opening_count = self.opening_mask.count_ones() as i32;
let delta = (self.opening_count as i32) - new_opening_count - 1;
self.opening_count = new_opening_count as u32;

Expand All @@ -73,7 +68,8 @@ impl<'a, B: InputBlock<'a, SIZE>> DepthBlock<'a> for DepthVector32<'a, B> {

#[inline(always)]
fn depth_at_end(&self) -> isize {
(((self.opening_count as i32) - unsafe { popcnt(self.closing_mask) }) + self.depth) as isize
debug_assert!(is_x86_feature_detected!("popcnt"));
(((self.opening_count as i32) - self.closing_mask.count_ones() as i32) + self.depth) as isize
}

#[inline(always)]
Expand All @@ -83,6 +79,7 @@ impl<'a, B: InputBlock<'a, SIZE>> DepthBlock<'a> for DepthVector32<'a, B> {

#[inline(always)]
fn estimate_lowest_possible_depth(&self) -> isize {
(self.depth - unsafe { popcnt(self.closing_mask) }) as isize
debug_assert!(is_x86_feature_detected!("popcnt"));
(self.depth - self.closing_mask.count_ones() as i32) as isize
}
}
15 changes: 3 additions & 12 deletions crates/rsonpath-lib/src/classification/depth/shared/mask_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,6 @@ pub(crate) struct DepthVector64<'a, B: InputBlock<'a, SIZE>> {
pub(crate) phantom: PhantomData<&'a ()>,
}

/// Count the bits set in `mask`, returning the count as a signed integer so it can be
/// mixed directly into signed depth arithmetic.
///
/// # Safety
/// The function is compiled with the `popcnt` target feature enabled, so it must only
/// be called on a CPU that supports that feature.
#[target_feature(enable = "popcnt")]
#[inline]
unsafe fn popcnt(mask: u64) -> i32 {
    // A `u64` has at most 64 set bits, so the conversion to `i32` is lossless.
    let set_bits: u32 = u64::count_ones(mask);
    set_bits as i32
}

impl<'a, B: InputBlock<'a, SIZE>> DepthBlock<'a> for DepthVector64<'a, B> {
#[inline(always)]
fn advance_to_next_depth_decrease(&mut self) -> bool {
Expand All @@ -53,8 +47,7 @@ impl<'a, B: InputBlock<'a, SIZE>> DepthBlock<'a> for DepthVector64<'a, B> {
bin_u64!("new opening_mask", self.opening_mask);
bin_u64!("new closing_mask", self.closing_mask);

// SAFETY: This module is meant to be included only under enabled popcnt.
let new_opening_count = unsafe { popcnt(self.opening_mask) };
let new_opening_count = self.opening_mask.count_ones() as i32;
let delta = (self.opening_count as i32) - new_opening_count - 1;
self.opening_count = new_opening_count as u32;

Expand All @@ -76,8 +69,7 @@ impl<'a, B: InputBlock<'a, SIZE>> DepthBlock<'a> for DepthVector64<'a, B> {
#[inline(always)]
fn depth_at_end(&self) -> isize {
debug_assert!(is_x86_feature_detected!("popcnt"));
// SAFETY: This module is meant to be included only under enabled popcnt.
(((self.opening_count as i32) - unsafe { popcnt(self.closing_mask) }) + self.depth) as isize
(((self.opening_count as i32) - self.closing_mask.count_ones() as i32) + self.depth) as isize
}

#[inline(always)]
Expand All @@ -88,7 +80,6 @@ impl<'a, B: InputBlock<'a, SIZE>> DepthBlock<'a> for DepthVector64<'a, B> {
#[inline(always)]
fn estimate_lowest_possible_depth(&self) -> isize {
debug_assert!(is_x86_feature_detected!("popcnt"));
// SAFETY: This module is meant to be included only under enabled popcnt.
(self.depth - unsafe { popcnt(self.closing_mask) }) as isize
(self.depth - self.closing_mask.count_ones() as i32) as isize
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,18 @@ impl DelimiterClassifierImpl128 {
Self { opening: opening as i8 }
}

/// Build a 128-bit vector in which every byte lane holds the stored `opening` byte.
///
/// # Safety
/// Calls the `_mm_set1_epi8` intrinsic; the caller must ensure the `sse2` target
/// feature is available on the running CPU.
#[target_feature(enable = "sse2")]
#[inline(always)]
unsafe fn opening_mask(&self) -> __m128i {
_mm_set1_epi8(self.opening)
}

/// Build a 128-bit vector in which every byte lane holds `opening + 2`.
///
/// NOTE(review): this presumably relies on ASCII placing each closing bracket two code
/// points after its opening counterpart (`[` 0x5B / `]` 0x5D, `{` 0x7B / `}` 0x7D) -
/// confirm that `opening` is always one of those two bytes.
///
/// # Safety
/// Calls the `_mm_set1_epi8` intrinsic; the caller must ensure the `sse2` target
/// feature is available on the running CPU.
#[target_feature(enable = "sse2")]
#[inline(always)]
unsafe fn closing_mask(&self) -> __m128i {
_mm_set1_epi8(self.opening + 2)
}

#[target_feature(enable = "sse2")]
#[inline]
pub(crate) unsafe fn get_opening_and_closing_masks(&self, bytes: &[u8]) -> (u16, u16) {
assert_eq!(16, bytes.len());
// SAFETY: target_feature invariant
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,18 @@ impl DelimiterClassifierImpl256 {
Self { opening: opening as i8 }
}

/// Build a 256-bit vector in which every byte lane holds the stored `opening` byte.
///
/// # Safety
/// Calls the `_mm256_set1_epi8` intrinsic; the caller must ensure the `avx2` target
/// feature named in the attribute is available on the running CPU.
#[target_feature(enable = "avx2")]
#[inline(always)]
unsafe fn opening_mask(&self) -> __m256i {
_mm256_set1_epi8(self.opening)
}

/// Build a 256-bit vector in which every byte lane holds `opening + 2`.
///
/// NOTE(review): this presumably relies on ASCII placing each closing bracket two code
/// points after its opening counterpart (`[` 0x5B / `]` 0x5D, `{` 0x7B / `}` 0x7D) -
/// confirm that `opening` is always one of those two bytes.
///
/// # Safety
/// Calls the `_mm256_set1_epi8` intrinsic; the caller must ensure the `avx2` target
/// feature named in the attribute is available on the running CPU.
#[target_feature(enable = "avx2")]
#[inline(always)]
unsafe fn closing_mask(&self) -> __m256i {
_mm256_set1_epi8(self.opening + 2)
}

#[target_feature(enable = "avx2")]
#[inline]
pub(crate) unsafe fn get_opening_and_closing_masks(&self, bytes: &[u8]) -> (u32, u32) {
assert_eq!(32, bytes.len());
// SAFETY: target_feature invariant
Expand Down
16 changes: 4 additions & 12 deletions crates/rsonpath-lib/src/classification/depth/sse2_32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,17 @@ use std::marker::PhantomData;

const SIZE: usize = 32;

shared::depth_classifier!(
Ssse3VectorIterator32,
DelimiterClassifierImpl128,
DepthVector32,
32,
u32
);
shared::depth_classifier!(Sse2VectorIterator32, DelimiterClassifierImpl128, DepthVector32, 32, u32);

/// Build a [`DepthVector32`] for the quote-classified block `bytes` using `classifier`.
///
/// Thin convenience wrapper: delegates to `new_vector_from` with `0` as the last argument.
/// NOTE(review): that argument is presumably the starting index within the block - confirm
/// against `new_vector_from`.
#[inline]
#[inline(always)]
fn new_vector<'a, B: InputBlock<'a, SIZE>>(
bytes: QuoteClassifiedBlock<B, u32, SIZE>,
classifier: &DelimiterClassifierImpl128,
) -> DepthVector32<'a, B> {
new_vector_from(bytes, classifier, 0)
}

#[inline]
#[inline(always)]
fn new_vector_from<'a, B: InputBlock<'a, SIZE>>(
bytes: QuoteClassifiedBlock<B, u32, SIZE>,
classifier: &DelimiterClassifierImpl128,
Expand All @@ -36,9 +30,7 @@ fn new_vector_from<'a, B: InputBlock<'a, SIZE>>(
unsafe { new_sse2(bytes, classifier, idx) }
}

#[target_feature(enable = "sse2")]
#[target_feature(enable = "popcnt")]
#[inline]
#[inline(always)]
unsafe fn new_sse2<'a, B: InputBlock<'a, SIZE>>(
bytes: QuoteClassifiedBlock<B, u32, SIZE>,
classifier: &DelimiterClassifierImpl128,
Expand Down
Loading

0 comments on commit 56472ed

Please sign in to comment.