diff --git a/.vscode/settings.json b/.vscode/settings.json index 0c91f750..0f2c51e3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -14,10 +14,12 @@ "cmpeq", "codegen", "color_eyre", + "complementation", "cvtsi", "datavalue", "dealloc", "Deque", + "determinization", "docsrs", "ebnf", "Elem", diff --git a/Cargo.lock b/Cargo.lock index bb886031..ebf24a6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -76,9 +76,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.80" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" +checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" [[package]] name = "arbitrary" @@ -171,9 +171,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.88" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02f341c093d19155a6e41631ce5971aac4e9a868262212153124c15fa22d1cdc" +checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" [[package]] name = "cfg-if" @@ -183,9 +183,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.5.2" +version = "4.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" +checksum = "949626d00e063efc93b6dca932419ceb5432f99769911c0b995f7e884c778813" dependencies = [ "clap_builder", "clap_derive", @@ -206,11 +206,11 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.0" +version = "4.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" +checksum = "90239a040c80f5e14809ca132ddc4176ab33d5e17e49691793296e3fcb34d72f" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.52", @@ -224,9 +224,9 @@ checksum 
= "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "color-eyre" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a667583cca8c4f8436db8de46ea8233c42a7d9ae424a82d338f2e4675229204" +checksum = "55146f5e46f237f7423d74111267d4597b59b0dad0ffaf7303bce9945d843ad5" dependencies = [ "backtrace", "eyre", @@ -451,6 +451,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "humantime" version = "2.1.0" @@ -475,9 +481,9 @@ checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683" [[package]] name = "indexmap" -version = "2.2.4" +version = "2.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "967d6dd42f16dbf0eb8040cb9e477933562684d3918f7d253f2ff9087fb3e7a3" +checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" dependencies = [ "equivalent", "hashbrown", @@ -685,9 +691,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" dependencies = [ "unicode-ident", ] @@ -850,9 +856,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" +checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", @@ -935,7 +941,7 @@ dependencies = [ name = "rsonpath-test-codegen" version = "0.8.7" dependencies = [ - "heck", 
+ "heck 0.4.1", "proc-macro2", "quote", "serde", @@ -1091,9 +1097,9 @@ checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "snapbox" -version = "0.5.7" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a99efa20de5053229642a477436cdb39828c7651c614622eb4888f9688523e6" +checksum = "8ac441e1ecf678f68423d47f376d53fabce1afba92c8f68e31508eb27df8562a" dependencies = [ "anstream", "anstyle", @@ -1563,9 +1569,9 @@ checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" [[package]] name = "winnow" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44e19b97e00a4d3db3cdb9b53c8c5f87151b5689b82cc86c2848cbdcccb2689b" +checksum = "dffa400e67ed5a4dd237983829e66475f0a4a26938c4b04c21baede6262215b8" dependencies = [ "memchr", ] diff --git a/crates/rsonpath-benchmarks b/crates/rsonpath-benchmarks index 93a0bf03..d7eb7740 160000 --- a/crates/rsonpath-benchmarks +++ b/crates/rsonpath-benchmarks @@ -1 +1 @@ -Subproject commit 93a0bf03f22f272c8d58fb1ec63f67e8da6182ca +Subproject commit d7eb7740c394ccfb202d3af4d3d15e9313f00fcd diff --git a/crates/rsonpath-lib/src/automaton.rs b/crates/rsonpath-lib/src/automaton.rs index 3aeab68f..bdbaddff 100644 --- a/crates/rsonpath-lib/src/automaton.rs +++ b/crates/rsonpath-lib/src/automaton.rs @@ -1,4 +1,5 @@ //! Automaton representations of a JSONPath query. +mod array_transition_set; pub mod error; mod minimizer; mod nfa; @@ -21,8 +22,23 @@ pub struct Automaton<'q> { /// Transition when a JSON member name matches a [`JsonString`]i. pub type MemberTransition<'q> = (&'q JsonString, State); -/// Transition on the n-th element of an array, with n specified by a [`JsonUInt`]. -pub type ArrayTransition<'q> = (JsonUInt, State); + +/// Transition on elements of an array with indices specified by either a single index +/// or a simple slice expression. 
+#[derive(Debug, PartialEq, Eq)] +pub struct ArrayTransition { + label: ArrayTransitionLabel, + target: State, +} + +/// Represent the distinct methods of moving on a match between states. +#[derive(Debug, Copy, PartialEq, Clone, Eq)] +pub(super) enum ArrayTransitionLabel { + /// Transition on the n-th element of an array, with n specified by a [`JsonUInt`]. + Index(JsonUInt), + /// Transition on elements of array matched by a slice expression - bounds and a step. + Slice(SimpleSlice), +} /// A transition table of a single [`State`] of an [`Automaton`]. /// @@ -32,10 +48,17 @@ pub type ArrayTransition<'q> = (JsonUInt, State); pub struct StateTable<'q> { attributes: StateAttributes, member_transitions: SmallVec<[MemberTransition<'q>; 2]>, - array_transitions: SmallVec<[ArrayTransition<'q>; 2]>, + array_transitions: SmallVec<[ArrayTransition; 2]>, fallback_state: State, } +#[derive(Debug, Copy, PartialEq, Clone, Eq)] +pub(crate) struct SimpleSlice { + start: JsonUInt, + end: Option, + step: JsonUInt, +} + impl<'q> Default for StateTable<'q> { #[inline] fn default() -> Self { @@ -76,6 +99,47 @@ impl<'q> Index for Automaton<'q> { } } +impl ArrayTransition { + pub(crate) fn new(label: ArrayTransitionLabel, target: State) -> Self { + Self { label, target } + } + + #[inline(always)] + pub(crate) fn target_state(&self) -> State { + self.target + } + + #[inline(always)] + pub(crate) fn matches(&self, index: JsonUInt) -> bool { + self.label.matches(index) + } +} + +impl ArrayTransitionLabel { + pub(crate) fn matches(&self, index: JsonUInt) -> bool { + match self { + Self::Index(i) => index.eq(i), + Self::Slice(s) => s.contains(index), + } + } +} + +impl From for ArrayTransitionLabel { + #[must_use] + #[inline(always)] + fn from(index: JsonUInt) -> Self { + Self::Index(index) + } +} + +impl From for ArrayTransitionLabel { + #[must_use] + #[inline(always)] + fn from(slice: SimpleSlice) -> Self { + Self::Slice(slice) + } +} + impl<'q> Automaton<'q> { /// Convert a 
[`JsonPathQuery`] into a minimal deterministic automaton. /// @@ -172,7 +236,7 @@ impl<'q> Automaton<'q> { #[must_use] #[inline(always)] pub fn has_any_array_item_transition(&self, state: State) -> bool { - self[state].attributes.has_array_index_transition() + self[state].attributes.has_array_transition() } /// Returns whether the given state is accepting the first item in a list. @@ -219,11 +283,11 @@ impl<'q> Automaton<'q> { #[inline(always)] pub fn has_array_index_transition_to_accepting(&self, state: State, match_index: &JsonUInt) -> bool { let state = &self[state]; - state.attributes.has_array_index_transition_to_accepting() + state.attributes.has_array_transition_to_accepting() && state .array_transitions() .iter() - .any(|(i, s)| i.eq(match_index) && self.is_accepting(*s)) + .any(|trans| self.is_accepting(trans.target) && trans.matches(*match_index)) } /// Returns whether the given state has any transitions @@ -303,7 +367,7 @@ impl<'q> StateTable<'q> { /// to the contained [`State`]. #[must_use] #[inline(always)] - pub fn array_transitions(&self) -> &[ArrayTransition<'q>] { + pub fn array_transitions(&self) -> &[ArrayTransition] { &self.array_transitions } @@ -318,6 +382,22 @@ impl<'q> StateTable<'q> { } } +impl Display for ArrayTransitionLabel { + #[inline(always)] + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Index(index) => write!(f, "{}", index.as_u64()), + Self::Slice(slice) => { + if let Some(end) = slice.end { + write!(f, "[{}:{}:{}]", slice.start, end, slice.step) + } else { + write!(f, "[{}::{}]", slice.start, slice.step) + } + } + } + } +} + impl<'q> Display for Automaton<'q> { #[inline] fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -348,8 +428,35 @@ impl<'q> Display for Automaton<'q> { } for (i, transitions) in self.states.iter().enumerate() { - for (label, state) in &transitions.array_transitions { - writeln!(f, " {i} -> {} [label=\"[{}]\"]", state.0, label.as_u64())? 
+            for array_transition in &transitions.array_transitions {
+                match array_transition.label {
+                    ArrayTransitionLabel::Index(label) => writeln!(
+                        f,
+                        " {i} -> {} [label=\"[{}]\"]",
+                        array_transition.target.0,
+                        label.as_u64()
+                    )?,
+                    ArrayTransitionLabel::Slice(label) => {
+                        if let Some(end) = label.end {
+                            writeln!(
+                                f,
+                                " {i} -> {} [label=\"[{}:{}:{}]\"]",
+                                array_transition.target.0,
+                                label.start.as_u64(),
+                                end.as_u64(),
+                                label.step.as_u64()
+                            )?
+                        } else {
+                            writeln!(
+                                f,
+                                " {i} -> {} [label=\"[{}::{}]\"]",
+                                array_transition.target.0,
+                                label.start.as_u64(),
+                                label.step.as_u64()
+                            )?
+                        }
+                    }
+                }
             }
             for (label, state) in &transitions.member_transitions {
                 writeln!(f, " {i} -> {} [label=\"{}\"]", state.0, label.unquoted())?
@@ -360,3 +467,50 @@ impl<'q> Display for Automaton<'q> {
         Ok(())
     }
 }
+
+impl SimpleSlice {
+    /// Create a slice from its first index, optional exclusive end, and step.
+    fn new(start: JsonUInt, end: Option<JsonUInt>, step: JsonUInt) -> Self {
+        Self { start, end, step }
+    }
+
+    /// Check whether `index` is selected by this slice.
+    ///
+    /// An index is selected when it is not below `start`, is strictly below `end`
+    /// (when the slice has one), and its distance from `start` is a multiple of `step`.
+    /// A slice with a zero step selects nothing.
+    #[inline(always)]
+    #[must_use]
+    fn contains(&self, index: JsonUInt) -> bool {
+        let step = self.step.as_u64();
+        // A zero step selects no elements, and it must never reach the modulo below
+        // because `x % 0` panics.
+        if step == 0 || index < self.start {
+            return false;
+        }
+        if self.end.map_or(false, |end| index >= end) {
+            return false;
+        }
+        (index.as_u64() - self.start.as_u64()) % step == 0
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::SimpleSlice;
+    use rsonpath_syntax::num::JsonUInt;
+    use test_case::test_case;
+
+    #[test_case(0.into(), None, 1.into(), 0.into() => true)]
+    #[test_case(2.into(), None, 1.into(), 3.into() => true)]
+    #[test_case(2.into(), None, 2.into(), 3.into() => false)]
+    #[test_case(3.into(), None, 2.into(), 3.into() => true)]
+    #[test_case(2.into(), None, 2.into(), 4.into() => true)]
+    #[test_case(2.into(), Some(6.into()), 2.into(), 2.into() => true)]
+    #[test_case(2.into(), Some(6.into()), 2.into(), 6.into() => false)]
+    #[test_case(2.into(), None, 0.into(), 2.into() => false)]
+    fn simple_slice_containment(start: JsonUInt, end: Option<JsonUInt>, step: JsonUInt, idx: JsonUInt) -> bool {
+        let slice = SimpleSlice::new(start, end, step);
+        slice.contains(idx)
+    }
+}
diff
--git a/crates/rsonpath-lib/src/automaton/array_transition_set.rs b/crates/rsonpath-lib/src/automaton/array_transition_set.rs new file mode 100644 index 00000000..83f1e94e --- /dev/null +++ b/crates/rsonpath-lib/src/automaton/array_transition_set.rs @@ -0,0 +1,409 @@ +//! Representation of linear sets of integers that capture JSONPath array index and slicing access. +//! +//! A _linear_ set is a set of integers in an arithmetic sequence: {a, a + k, a + 2k, ...}. +//! It can be either bounded or infinite. It has the general form of a:b:k, where b is the end bound. +//! These are exactly the sets that the slicing operator can express. The index selector is also a linear set +//! of form {a:a+1:1} (step doesn't matter, but 1 is chosen as canonical). +//! +//! These sets are closed under intersection, which is a crucial property. +//! This module allows manipulating a set of transitions from a single state labelled with linear sets +//! and automatic resolution of transition overlaps. +//! +//! ## Motivation +//! +//! Regular sets capture what happens during determinization of the query NFA, when multiple slice and/or index +//! selectors need to be combined. +//! +//! Consider a scenario where we have a complex transition labelled with a regular set X to some set of states S, +//! and in comes a slice selector Y supposed to transition to {t}. It is not contained within the regular set X, but +//! has a non-empty intersection. In that case we need the result to be transitions: +//! * over X-Y to S +//! * over Y-X to {t} +//! * over X intersect Y to S+{t} +//! +//! Linear sets are not closed under complementation or subtraction - the representation of X-Y might be +//! complex. Therefore, we rely on the engine to process the transitions from first to last and during compilation +//! maintain the correct order. The order is enforced via **priorities**. To represent the case above, we emit the +//! following prioritized transitions: +//! * {prio. 
2} over X intersect Y to S+{t} +//! * {prio. 1} over X to S +//! * {prio. 1} over Y to {t} +//! The semantics are correct as long as the transitions are taken in non-increasing order of priorities. +//! +//! Intersection of two linear sets is always a linear set. Finding such an intersection is not trivial, +//! but doable. One needs to solve a linear congruence to find the smallest common element of the two sets, +//! if one exists, and then step by the least common multiple of the step values. +//! +//! ## Optimizations +//! +//! 1. This module automatically optimizes a few cases. A linear set is always represented canonically +//! – empty sets are not relevant, so such transitions are not created; sets containing a single element +//! are always represented as a singleton. +//! +//! 2. Engine runtime depends on the number of transitions it needs to process, so emitting superfluous transitions +//! is discouraged. In the case where the overlap of X and Y is X (or Y), it suffices to emit only X intersect Y and Y (or X) +//! – the third transition would never be taken anyway. Moreover, if X = Y then only one transition needs to be emitted. +//! This check is not perfect, as a transition can be dominated by two other transitions piecewise ((X \cap Y) \cup (X \cap Z) = X), +//! but it does help reduce transitions, especially in cases where X is a singleton.
+
+use super::{
+    small_set::{SmallSet, SmallSet256},
+    ArrayTransitionLabel, SimpleSlice,
+};
+use rsonpath_syntax::num::JsonUInt;
+use std::collections::HashMap;
+
+/// Array transitions leaving a single state, keyed by the linear set that labels each of them.
+///
+/// Overlapping labels are resolved when a transition is added: the overlap gets its own
+/// transition with a higher priority. Iterating the set yields transitions in
+/// non-increasing order of priority.
+#[derive(Debug)]
+pub(super) struct ArrayTransitionSet {
+    transitions: HashMap<LinearSet, LinearSetTransition>,
+}
+
+/// Target of a transition together with its priority; higher priorities are yielded first.
+#[derive(Debug)]
+struct LinearSetTransition {
+    priority: usize,
+    target: SmallSet256,
+}
+
+/// A linear set of array indices.
+///
+/// Values produced by [`LinearSet::from_label`] and [`LinearSet::from_slice`] are never empty,
+/// and a set with exactly one element is always a [`LinearSet::Singleton`].
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
+enum LinearSet {
+    /// A single index.
+    Singleton(JsonUInt),
+    /// `(start, end, step)`: indices `start, start + step, ...` strictly below `end`.
+    BoundedSlice(JsonUInt, JsonUInt, JsonUInt),
+    /// `(start, step)`: indices `start, start + step, ...` without an upper bound.
+    OpenEndedSlice(JsonUInt, JsonUInt),
+}
+
+/// Owning iterator over an [`ArrayTransitionSet`], ordered by non-increasing priority.
+pub(super) struct ArrayTransitionSetIterator {
+    transitions: std::vec::IntoIter<(LinearSet, LinearSetTransition)>,
+}
+
+impl ArrayTransitionSet {
+    /// Create an empty transition set.
+    pub(super) fn new() -> Self {
+        Self {
+            transitions: HashMap::new(),
+        }
+    }
+
+    /// Add a transition over `label` to `target`.
+    ///
+    /// A label denoting an empty set is ignored. For every transition already in the set whose
+    /// label overlaps the new one, a transition over the overlap is added (or merged into the
+    /// entry already stored for that exact set). Its priority is one above the overlapped
+    /// transition's, and its target is the union of `target` and the overlapped transition's target.
+    pub(super) fn add_transition(&mut self, label: ArrayTransitionLabel, target: SmallSet256) {
+        use std::collections::hash_map::Entry;
+        let Some(label) = LinearSet::from_label(label) else {
+            return;
+        };
+        let overlaps: Vec<_> = self
+            .transitions
+            .iter()
+            .filter_map(|(other, trans)| {
+                let overlap = other.overlap_with(&label)?;
+                let priority = trans.priority + 1;
+                // An index in the overlap matches both labels, so it must lead to both targets.
+                let mut overlap_target = target;
+                overlap_target.union(&trans.target);
+
+                Some((
+                    overlap,
+                    LinearSetTransition {
+                        priority,
+                        target: overlap_target,
+                    },
+                ))
+            })
+            .collect();
+
+        for (label, trans) in overlaps {
+            match self.transitions.entry(label) {
+                Entry::Occupied(mut entry) => {
+                    let entry = entry.get_mut();
+                    entry.priority = std::cmp::max(entry.priority, trans.priority);
+                    entry.target.union(&trans.target);
+                }
+                Entry::Vacant(entry) => {
+                    entry.insert(trans);
+                }
+            }
+        }
+
+        match self.transitions.entry(label) {
+            // Label overlapped (entirely) with some existing label, so it is already handled.
+            Entry::Occupied(_) => (),
+            Entry::Vacant(entry) => {
+                entry.insert(LinearSetTransition { priority: 1, target });
+            }
+        }
+    }
+
+    /// Iterate mutably over the target sets of all transitions, in no particular order.
+    pub(super) fn states_mut(&mut self) -> impl Iterator<Item = &mut SmallSet256> {
+        self.transitions.iter_mut().map(|(_, trans)| &mut trans.target)
+    }
+}
+
+impl ArrayTransitionSetIterator {
+    /// Sort the transitions by descending priority and wrap them in an iterator.
+    fn new(mut transitions: Vec<(LinearSet, LinearSetTransition)>) -> Self {
+        transitions.sort_by(|(_, x), (_, y)| x.priority.cmp(&y.priority).reverse());
+        Self {
+            transitions: transitions.into_iter(),
+        }
+    }
+}
+
+impl IntoIterator for ArrayTransitionSet {
+    type Item = (ArrayTransitionLabel, SmallSet256);
+
+    type IntoIter = ArrayTransitionSetIterator;
+
+    fn into_iter(self) -> Self::IntoIter {
+        ArrayTransitionSetIterator::new(self.transitions.into_iter().collect())
+    }
+}
+
+impl Iterator for ArrayTransitionSetIterator {
+    type Item = (ArrayTransitionLabel, SmallSet256);
+
+    /// Yield the next transition, translating the internal linear set back into a label.
+    fn next(&mut self) -> Option<Self::Item> {
+        let (label, transition) = self.transitions.next()?;
+        Some(match label {
+            LinearSet::Singleton(idx) => (ArrayTransitionLabel::Index(idx), transition.target),
+            LinearSet::BoundedSlice(start, end, step) => (
+                ArrayTransitionLabel::Slice(SimpleSlice::new(start, Some(end), step)),
+                transition.target,
+            ),
+            LinearSet::OpenEndedSlice(start, step) => (
+                ArrayTransitionLabel::Slice(SimpleSlice::new(start, None, step)),
+                transition.target,
+            ),
+        })
+    }
+}
+
+impl LinearSet {
+    /// Convert a transition label into its canonical linear set.
+    ///
+    /// Returns `None` when the label is a slice that selects no indices.
+    fn from_label(label: ArrayTransitionLabel) -> Option<Self> {
+        match label {
+            ArrayTransitionLabel::Index(idx) => Some(Self::Singleton(idx)),
+            ArrayTransitionLabel::Slice(slice) => Self::from_slice(slice.start, slice.end, slice.step),
+        }
+    }
+
+    /// Build the canonical linear set for the slice `start:end:step`.
+    ///
+    /// Returns `None` for an empty slice (zero step, or `start >= end`), and a
+    /// [`LinearSet::Singleton`] when only `start` lies within the bounds.
+    fn from_slice(start: JsonUInt, end: Option<JsonUInt>, step: JsonUInt) -> Option<Self> {
+        if step == JsonUInt::ZERO {
+            None
+        } else if let Some(end) = end {
+            if start >= end {
+                None
+            } else if start.as_u64().saturating_add(step.as_u64()) >= end.as_u64() {
+                // Only one item within the slice.
+                Some(Self::Singleton(start))
+            } else {
+                debug_assert!(end > JsonUInt::ZERO);
+                Some(Self::BoundedSlice(start, end, step))
+            }
+        } else {
+            Some(Self::OpenEndedSlice(start, step))
+        }
+    }
+
+    /// Compute the intersection of two linear sets, or `None` if they share no index.
+    ///
+    /// # Panics
+    /// Panics if either set reports a zero step.
+    fn overlap_with(&self, other: &Self) -> Option<Self> {
+        // Assume the first set starts not-later, otherwise flip.
+        if self.start() > other.start() {
+            return other.overlap_with(self);
+        }
+        // Empty sets are discarded on construction.
+        assert_ne!(self.step().as_u64(), 0);
+        assert_ne!(other.step().as_u64(), 0);
+
+        // First we take both sets as if they are open-ended and linear.
+        // We can take an overlap under that assumption and then simply apply the lower of the two end constraints,
+        // if any, to obtain the ultimate result.
+        //
+        // If first_element is beyond the range of JsonUInt it will fail conversion at the end of this function,
+        // and result in an empty set (empty transition = no transition). This is correct behavior - first element
+        // out of bounds means there are no valid elements.
+        let (first_element, gcd) = find_first_element(
+            self.start().into(),
+            self.step().into(),
+            other.start().into(),
+            other.step().into(),
+        )?;
+        // Perform the min of ends where None is treated as larger than everything.
+        let end = match (self.end_exclusive(), other.end_exclusive()) {
+            (None, Some(x)) | (Some(x), None) => Some(x),
+            (None, None) => None,
+            (Some(x), Some(y)) => Some(std::cmp::min(x, y)),
+        };
+        // This can also overflow both JsonUInt and u64. We saturate and then convert to JsonUInt.
+        // A step that fails this conversion is essentially infinite, which means we need to emit a set containing only the
+        // first_element.
+        let common_step = (self.step().as_u64() / gcd).saturating_mul(other.step().as_u64());
+
+        let start = JsonUInt::try_from(first_element).ok()?;
+
+        return match JsonUInt::try_from(common_step).ok() {
+            Some(step) => Self::from_slice(start, end, step),
+            None if end.map_or(false, |end| end <= start) => None,
+            None => Some(Self::Singleton(start)),
+        };
+
+        fn find_first_element(a: i64, k: i64, b: i64, l: i64) -> Option<(i64, u64)> {
+            // Now we have two sets, S1=[a::k] and S2=[b::l], a <= b.
+            // Clearly b \in S2 and every +l step is in S2.
+            // The distance from b up to the nearest element of S1 that is not below b is given by:
+            //   c = (k - (b - a)) mod k
+            // (note that this can be zero if b-a is a multiple of k, which makes sense)
+            //
+            // To get a common element we need to apply +l steps until we land in S1,
+            // that is until the distance travelled from b is congruent to c:
+            //   lx = c mod k
+            //
+            // This is a linear congruence which has a known algorithm using extended Euclid.
+            let c = umod(k - (b - a), k);
+            let (jumps, gcd) = solve_linear_congruence(l, c, k)?;
+            Some((jumps.checked_mul(l)?.checked_add(b)?, gcd))
+        }
+    }
+
+    /// Smallest index in the set.
+    fn start(&self) -> JsonUInt {
+        match self {
+            Self::Singleton(i) | Self::BoundedSlice(i, _, _) | Self::OpenEndedSlice(i, _) => *i,
+        }
+    }
+
+    /// Exclusive upper bound of the set, or `None` if there is none.
+    ///
+    /// For a singleton this is the index plus one; it is `None` when that value
+    /// cannot be converted to a `JsonUInt`.
+    fn end_exclusive(&self) -> Option<JsonUInt> {
+        match self {
+            Self::Singleton(i) => JsonUInt::try_from(i.as_u64() + 1).ok(),
+            Self::BoundedSlice(_, i, _) => Some(*i),
+            Self::OpenEndedSlice(_, _) => None,
+        }
+    }
+
+    /// Distance between consecutive indices; a singleton reports a step of one.
+    fn step(&self) -> JsonUInt {
+        match self {
+            Self::Singleton(_) => JsonUInt::ONE,
+            Self::BoundedSlice(_, _, s) | Self::OpenEndedSlice(_, s) => *s,
+        }
+    }
+}
+
+/// Unsigned modulo, a.k.a. proper mathematical modulo.
+/// Returns the unique number k such that
+/// x === k mod m AND 0 <= k < m
+/// m must be positive.
+fn umod(x: i64, m: i64) -> i64 {
+    assert!(m > 0);
+    // For a positive modulus the Euclidean remainder is exactly the value in 0..m.
+    x.rem_euclid(m)
+}
+
+/// Solve ax = b mod m.
+/// If any solution exists, returns the smallest non-negative solution and the unique gcd(a, m).
+/// Returns `None` if the congruence has no solution.
+///
+/// # Panics
+/// Panics if m is not positive, or if the computed gcd is negative.
+fn solve_linear_congruence(a: i64, b: i64, m: i64) -> Option<(i64, u64)> {
+    // If gcd(a, m) does not divide b mod m, then there are no solutions.
+    // Otherwise, find the (x,y) that solve ax - my = gcd(a, m)
+    // and take x*(b/gcd(a,m)) mod (m/gcd(a,m)) as the solution.
+    //
+    // Note that there may be multiple solutions if gcd(a, m) > 1,
+    // but this always gives the smallest one.
+    let b = umod(b, m);
+    let (x, gcd) = extended_euclid(a, m);
+
+    if b % gcd != 0 {
+        return None;
+    }
+
+    // The product x * (b / gcd) can exceed i64 even when the reduced solution is small,
+    // so it is computed and reduced in i128, where a product of two i64 values always fits.
+    // Treating an overflow as "no solution" would silently drop a valid overlap.
+    let solution = (i128::from(x) * i128::from(b / gcd)).rem_euclid(i128::from(m / gcd));
+
+    Some((
+        i64::try_from(solution).expect("solution is reduced modulo an i64 value"),
+        u64::try_from(gcd).expect("negative gcd"),
+    ))
+}
+
+/// Extended Euclidean algorithm for `a` and `b`.
+///
+/// Returns `(x, gcd)` where `gcd` is the greatest common divisor of `a` and `b`, and `x` is
+/// the coefficient of `a` in `a*x + b*y = gcd`. Only x and gcd are returned; y is not computed.
+fn extended_euclid(a: i64, b: i64) -> (i64, i64) {
+    let (mut old_r, mut r) = (a, b);
+    let (mut old_x, mut x) = (1, 0);
+
+    while r != 0 {
+        let quotient = old_r / r;
+        (old_r, r) = (r, old_r - quotient * r);
+        (old_x, x) = (x, old_x - quotient * x);
+    }
+
+    (old_x, old_r)
+}
+
+#[cfg(test)]
+mod tests {
+    use test_case::test_case;
+
+    use super::LinearSet;
+
+    #[test_case(1, 1 => (0, 1))]
+    #[test_case(4, 10 => (-2, 2))]
+    #[test_case(7, 10 => (3, 1))]
+    #[test_case(8, 10 => (-1, 2))]
+    #[test_case(161, 28 => (-1, 7))]
+    fn extended_euclid_tests(a: i64, b: i64) -> (i64, i64) {
+        super::extended_euclid(a, b)
+    }
+
+    #[test_case(7, 3, 10 => Some((9, 1)))]
+    #[test_case(7, 8, 10 => Some((4, 1)))]
+    #[test_case(8, 3, 10 => None)]
+    #[test_case(8, 2, 10 => Some((4, 2)))]
+    #[test_case(94_253_004_627_829, 666_084_837_845, 888_777_666_555_119 => Some((2_412_193, 121_216_531)))]
+    #[test_case(6_253_004_621, 2_156_208_490, 27_815_089_521 => Some((116, 215_620_849)))]
+    #[test_case(3, 4_503_599_627_370_495, 4_503_599_627_370_496 => Some((1_501_199_875_790_165, 1)))]
+    fn linear_congruence_tests(a: i64, b: i64, m: i64) -> Option<(i64, u64)> {
+        super::solve_linear_congruence(a, b, m)
+    }
+
+
#[test_case(LinearSet::Singleton(1.into()), LinearSet::Singleton(1.into()) => Some(LinearSet::Singleton(1.into())))] + #[test_case(LinearSet::Singleton(1.into()), LinearSet::Singleton(2.into()) => None)] + #[test_case( + LinearSet::Singleton(3.into()), + LinearSet::BoundedSlice(3.into(), 15.into(), 2.into()) + => Some(LinearSet::Singleton(3.into())))] + #[test_case( + LinearSet::Singleton(5.into()), + LinearSet::BoundedSlice(3.into(), 15.into(), 2.into()) + => Some(LinearSet::Singleton(5.into())))] + #[test_case( + LinearSet::Singleton(15.into()), + LinearSet::BoundedSlice(3.into(), 15.into(), 2.into()) + => None)] + #[test_case( + LinearSet::BoundedSlice(3.into(), 15.into(), 2.into()), + LinearSet::BoundedSlice(3.into(), 15.into(), 2.into()) + => Some(LinearSet::BoundedSlice(3.into(), 15.into(), 2.into())))] + #[test_case( + LinearSet::BoundedSlice(5.into(), 1024.into(), 7.into()), + LinearSet::BoundedSlice(3.into(), 911.into(), 10.into()) + => Some(LinearSet::BoundedSlice(33.into(), 911.into(), 70.into())))] + #[test_case( + LinearSet::OpenEndedSlice(5.into(), 7.into()), + LinearSet::OpenEndedSlice(3.into(), 10.into()) + => Some(LinearSet::OpenEndedSlice(33.into(), 70.into())))] + #[test_case( + LinearSet::OpenEndedSlice(5.into(), 8.into()), + LinearSet::OpenEndedSlice(3.into(), 10.into()) + => Some(LinearSet::OpenEndedSlice(13.into(), 40.into())))] + #[test_case( + LinearSet::OpenEndedSlice(156_208_490.try_into().unwrap(), 6_253_004_621_u64.try_into().unwrap()), + LinearSet::OpenEndedSlice(4_253_004_621_u64.try_into().unwrap(), 27_815_089_521_u64.try_into().unwrap()) + => Some(LinearSet::OpenEndedSlice(87_698_273_184_u64.try_into().unwrap(), 806_637_596_109_u64.try_into().unwrap())))] + #[test_case( + LinearSet::OpenEndedSlice(666_123_456_789_u64.try_into().unwrap(), 888_777_666_555_119_u64.try_into().unwrap()), + LinearSet::OpenEndedSlice(888_777_705_174_063_u64.try_into().unwrap(), 94_253_004_627_829_u64.try_into().unwrap()) + => None)] + fn overlap_tests(a: 
LinearSet, b: LinearSet) -> Option { + a.overlap_with(&b) + } +} diff --git a/crates/rsonpath-lib/src/automaton/minimizer.rs b/crates/rsonpath-lib/src/automaton/minimizer.rs index d4e83d65..ace0d8fc 100644 --- a/crates/rsonpath-lib/src/automaton/minimizer.rs +++ b/crates/rsonpath-lib/src/automaton/minimizer.rs @@ -3,14 +3,15 @@ // NOTE: Some comments in this module are outdated, because the minimizer doesn't // actually produce minimal automata as of now - see #91. use super::{ + array_transition_set::ArrayTransitionSet, error::CompilerError, nfa::{self, NfaState, NfaStateId}, small_set::{SmallSet, SmallSet256}, state::StateAttributesBuilder, - {Automaton, NondeterministicAutomaton, State as DfaStateId, StateAttributes, StateTable}, + Automaton, NondeterministicAutomaton, State as DfaStateId, StateAttributes, StateTable, }; -use crate::debug; -use rsonpath_syntax::{num::JsonUInt, str::JsonString}; +use crate::{automaton::ArrayTransition, debug}; +use rsonpath_syntax::str::JsonString; use smallvec::{smallvec, SmallVec}; use vector_map::VecMap; @@ -47,7 +48,8 @@ pub(super) struct Minimizer<'q> { #[derive(Debug)] struct SuperstateTransitionTable<'q> { - labelled: VecMap, SmallSet256>, + array: ArrayTransitionSet, + member: VecMap<&'q JsonString, SmallSet256>, wildcard: SmallSet256, } @@ -140,16 +142,16 @@ impl<'q> Minimizer<'q> { debug!("Normalized transitions: {:?}", transitions); // Translate the transitions to the data model expected by TransitionTable. 
- let mut array_transitions = smallvec![]; - let mut member_transitions = smallvec![]; - - for (label, state) in transitions.labelled { - let state = self.superstates[&state]; - match label { - nfa::TransitionLabel::ArrayIndex(i) => array_transitions.push((i, state)), - nfa::TransitionLabel::ObjectMember(s) => member_transitions.push((s, state)), - } - } + let array_transitions = transitions + .array + .into_iter() + .map(|(label, state)| ArrayTransition::new(label, self.superstates[&state])) + .collect::>(); + let member_transitions = transitions + .member + .into_iter() + .map(|(label, state)| (label, self.superstates[&state])) + .collect::>(); debug!("Translated transitions (array): {array_transitions:?}"); debug!("Translated transitions (member): {member_transitions:?}"); @@ -172,7 +174,7 @@ impl<'q> Minimizer<'q> { fn build_attributes( &self, id: DfaStateId, - array_transitions: &[(JsonUInt, DfaStateId)], + array_transitions: &[ArrayTransition], member_transitions: &[(&JsonString, DfaStateId)], fallback: DfaStateId, ) -> StateAttributes { @@ -191,7 +193,9 @@ impl<'q> Minimizer<'q> { attrs = attrs.unitary(); } if self.accepting.contains(fallback.0) - || array_transitions.iter().any(|(_, s)| self.accepting.contains(s.0)) + || array_transitions + .iter() + .any(|x| self.accepting.contains(x.target_state().0)) || member_transitions.iter().any(|(_, s)| self.accepting.contains(s.0)) { debug!("{id} has transitions to accepting"); @@ -199,11 +203,14 @@ impl<'q> Minimizer<'q> { } if !array_transitions.is_empty() { debug!("{id} has an array index transition"); - attrs = attrs.has_array_index_transition(); + attrs = attrs.has_array_transition(); } - if array_transitions.iter().any(|(_, s)| self.accepting.contains(s.0)) { + if array_transitions + .iter() + .any(|x| self.accepting.contains(x.target_state().0)) + { debug!("{id} has an accepting array index transition"); - attrs = attrs.has_array_index_transition_to_accepting(); + attrs = 
attrs.has_array_transition_to_accepting(); } attrs.into() @@ -257,7 +264,8 @@ impl<'q> Minimizer<'q> { debug!("Wildcard target: {wildcard_targets:?}"); let mut transitions = SuperstateTransitionTable { - labelled: VecMap::new(), + array: ArrayTransitionSet::new(), + member: VecMap::new(), wildcard: wildcard_targets, }; @@ -266,19 +274,55 @@ impl<'q> Minimizer<'q> { // Direct states simply have a single transition to the next state in the NFA. // Recursive transitions also have a self-loop, but that is handled by the // checkpoints mechanism - here we only handle the forward transition. - NfaState::Direct(nfa::Transition::Labelled(label)) - | NfaState::Recursive(nfa::Transition::Labelled(label)) => { - debug!("Considering transition {nfa_state} --{}-> {}", label, nfa_state.next()?,); + NfaState::Direct(nfa::Transition::Member(label)) + | NfaState::Recursive(nfa::Transition::Member(label)) => { + debug!( + "Considering member transition {nfa_state} --{}-> {}", + label.unquoted(), + nfa_state.next()?, + ); // Add the target NFA state to the target superstate, or create a singleton // set if this is the first transition via this label encountered in the loop. - if let Some(target) = transitions.labelled.get_mut(&label) { + if let Some(target) = transitions.member.get_mut(&label) { target.insert(nfa_state.next()?.0); } else { let mut new_set = transitions.wildcard; new_set.insert(nfa_state.next()?.0); - transitions.labelled.insert(label, new_set); + transitions.member.insert(label, new_set); } } + NfaState::Direct(nfa::Transition::Array(label)) + | NfaState::Recursive(nfa::Transition::Array(label)) => { + // Array transitions are trickier, as they can have overlap. For example, + // a transition over [5] overlaps with a transition over [3::2]. + // If the incoming transition does not overlap with anything then it's easy and analogous + // to the member case - create a new singleton set with a single transition. 
+ // Otherwise we need to solve conflicts with - potentially many! - existing transitions. + // Fortunately, the conflicts can be resolved one at a time. + // Assume we're processing --t1--> {s1} and there already is a --t2--> S2 (where S2 is a superstate), + // such that t1 overlaps with t2 (overlap(t1, t2) = t3). + // The resolution is to have the following transitions: + // --t3--> S2+{s1} + // --(t1-t3)--> {s1} + // --(t2-t3)--> S2 + // If t1 and t2 are slices then t3 is easy to compute and is also a slice. + // This is not the case for (t1-t3) or (t2-t3). Turns out this is actually a hard problem to solve. + // We can get away with a trick, however. As long as the engine always processes transitions in order + // and takes the first one that matches, it is enough for the procedure here to emit + // --t3--> S2+{s1} + // --t1--> {s1} + // --t2--> S2 + // and make sure the transition over t3 is put before the other two. + // The ArrayTransitionSet does that by assigning priorities to transitions and sorting them accordingly. 
+ debug!( + "Considering array transition {nfa_state} --{}-> {}", + label, + nfa_state.next()?, + ); + let mut new_set = transitions.wildcard; + new_set.insert(nfa_state.next()?.0); + transitions.array.add_transition(label, new_set); + } NfaState::Direct(nfa::Transition::Wildcard) | NfaState::Recursive(nfa::Transition::Wildcard) | NfaState::Accepting => (), @@ -315,7 +359,10 @@ impl<'q> Minimizer<'q> { } normalize_one(self, &mut transitions.wildcard, current_checkpoint)?; - for (_, state) in &mut transitions.labelled { + for (_, state) in &mut transitions.member { + normalize_one(self, state, current_checkpoint)?; + } + for state in &mut transitions.array.states_mut() { normalize_one(self, state, current_checkpoint)?; } @@ -352,7 +399,8 @@ mod tests { ordered_states: vec![NfaState::Accepting], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -380,7 +428,8 @@ mod tests { ordered_states: vec![NfaState::Direct(nfa::Transition::Wildcard), NfaState::Accepting], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -414,12 +463,13 @@ mod tests { let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Direct(nfa::Transition::Labelled(label.into())), + NfaState::Direct(nfa::Transition::Array(label.into())), NfaState::Accepting, ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -429,13 +479,13 @@ mod tests { attributes: StateAttributes::REJECTING, }, StateTable { - array_transitions: smallvec![(label, State(2))], + array_transitions: smallvec![ArrayTransition::new(ArrayTransitionLabel::Index(label), State(2))], member_transitions: smallvec![], fallback_state: State(0), attributes: 
StateAttributes::UNITARY | StateAttributes::TRANSITIONS_TO_ACCEPTING - | StateAttributes::HAS_ARRAY_INDEX_TRANSITION - | StateAttributes::HAS_ARRAY_INDEX_TRANSITION_TO_ACCEPTING, + | StateAttributes::HAS_ARRAY_TRANSITION + | StateAttributes::HAS_ARRAY_TRANSITION_TO_ACCEPTING, }, StateTable { array_transitions: smallvec![], @@ -456,7 +506,8 @@ mod tests { ordered_states: vec![NfaState::Recursive(nfa::Transition::Wildcard), NfaState::Accepting], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -491,16 +542,17 @@ mod tests { let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Recursive(nfa::Transition::Labelled((&label_a).into())), - NfaState::Direct(nfa::Transition::Labelled((&label_b).into())), + NfaState::Recursive(nfa::Transition::Member(&label_a)), + NfaState::Direct(nfa::Transition::Member(&label_b)), NfaState::Recursive(nfa::Transition::Wildcard), - NfaState::Direct(nfa::Transition::Labelled((&label_a).into())), - NfaState::Recursive(nfa::Transition::Labelled((&label_b).into())), + NfaState::Direct(nfa::Transition::Member(&label_a)), + NfaState::Recursive(nfa::Transition::Member(&label_b)), NfaState::Accepting, ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -555,21 +607,21 @@ mod tests { fn interstitial_nondescendant_wildcard() { // Query = $..a.b.*.a..b let label_a = JsonString::new("a"); - let label_b = JsonString::new("b"); let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Recursive(nfa::Transition::Labelled((&label_a).into())), - NfaState::Direct(nfa::Transition::Labelled((&label_b).into())), + NfaState::Recursive(nfa::Transition::Member(&label_a)), + NfaState::Direct(nfa::Transition::Member(&label_b)), NfaState::Direct(nfa::Transition::Wildcard), - 
NfaState::Direct(nfa::Transition::Labelled((&label_a).into())), - NfaState::Recursive(nfa::Transition::Labelled((&label_b).into())), + NfaState::Direct(nfa::Transition::Member(&label_a)), + NfaState::Recursive(nfa::Transition::Member(&label_b)), NfaState::Accepting, ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -592,33 +644,33 @@ mod tests { }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label_a, State(5))], - fallback_state: State(4), + member_transitions: smallvec![(&label_a, State(4))], + fallback_state: State(7), attributes: StateAttributes::EMPTY, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label_a, State(6))], + member_transitions: smallvec![(&label_a, State(5)), (&label_b, State(3))], fallback_state: State(1), attributes: StateAttributes::EMPTY, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label_a, State(6)), (&label_b, State(3))], - fallback_state: State(1), - attributes: StateAttributes::EMPTY, + member_transitions: smallvec![(&label_b, State(6))], + fallback_state: State(5), + attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label_b, State(7))], - fallback_state: State(6), - attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING, + member_transitions: smallvec![(&label_b, State(6))], + fallback_state: State(5), + attributes: StateAttributes::ACCEPTING | StateAttributes::TRANSITIONS_TO_ACCEPTING, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label_b, State(7))], - fallback_state: State(6), - attributes: StateAttributes::ACCEPTING | StateAttributes::TRANSITIONS_TO_ACCEPTING, + member_transitions: smallvec![(&label_a, State(5))], + fallback_state: State(1), + attributes: StateAttributes::EMPTY, }, ], 
}; @@ -633,13 +685,14 @@ mod tests { let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Recursive(nfa::Transition::Labelled((&label).into())), + NfaState::Recursive(nfa::Transition::Member(&label)), NfaState::Direct(nfa::Transition::Wildcard), NfaState::Accepting, ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -656,21 +709,21 @@ mod tests { }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label, State(4))], - fallback_state: State(3), + member_transitions: smallvec![(&label, State(3))], + fallback_state: State(4), attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label, State(2))], - fallback_state: State(1), - attributes: StateAttributes::ACCEPTING, + member_transitions: smallvec![(&label, State(3))], + fallback_state: State(4), + attributes: StateAttributes::ACCEPTING | StateAttributes::TRANSITIONS_TO_ACCEPTING, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label, State(4))], - fallback_state: State(3), - attributes: StateAttributes::ACCEPTING | StateAttributes::TRANSITIONS_TO_ACCEPTING, + member_transitions: smallvec![(&label, State(2))], + fallback_state: State(1), + attributes: StateAttributes::ACCEPTING, }, ], }; @@ -685,12 +738,13 @@ mod tests { let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Recursive(nfa::Transition::Labelled(label.into())), + NfaState::Recursive(nfa::Transition::Array(label.into())), NfaState::Accepting, ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -700,20 +754,20 @@ mod tests { attributes: StateAttributes::REJECTING, }, StateTable { - array_transitions: smallvec![(label, State(2)),], + 
array_transitions: smallvec![ArrayTransition::new(ArrayTransitionLabel::Index(label), State(2)),], member_transitions: smallvec![], fallback_state: State(1), attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING - | StateAttributes::HAS_ARRAY_INDEX_TRANSITION - | StateAttributes::HAS_ARRAY_INDEX_TRANSITION_TO_ACCEPTING, + | StateAttributes::HAS_ARRAY_TRANSITION + | StateAttributes::HAS_ARRAY_TRANSITION_TO_ACCEPTING, }, StateTable { - array_transitions: smallvec![(label, State(2))], + array_transitions: smallvec![ArrayTransition::new(ArrayTransitionLabel::Index(label), State(2))], member_transitions: smallvec![], fallback_state: State(1), attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING - | StateAttributes::HAS_ARRAY_INDEX_TRANSITION - | StateAttributes::HAS_ARRAY_INDEX_TRANSITION_TO_ACCEPTING + | StateAttributes::HAS_ARRAY_TRANSITION + | StateAttributes::HAS_ARRAY_TRANSITION_TO_ACCEPTING | StateAttributes::ACCEPTING, }, ], @@ -729,7 +783,7 @@ mod tests { let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Direct(nfa::Transition::Labelled((&label).into())), + NfaState::Direct(nfa::Transition::Member(&label)), NfaState::Direct(nfa::Transition::Wildcard), NfaState::Direct(nfa::Transition::Wildcard), NfaState::Direct(nfa::Transition::Wildcard), @@ -737,7 +791,8 @@ mod tests { ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -789,14 +844,15 @@ mod tests { let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Recursive(nfa::Transition::Labelled((&label).into())), + NfaState::Recursive(nfa::Transition::Member(&label)), NfaState::Direct(nfa::Transition::Wildcard), NfaState::Direct(nfa::Transition::Wildcard), NfaState::Accepting, ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ 
-813,44 +869,44 @@ mod tests { }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label, State(4))], - fallback_state: State(3), - attributes: StateAttributes::EMPTY, - }, - StateTable { - array_transitions: smallvec![], - member_transitions: smallvec![(&label, State(8))], + member_transitions: smallvec![(&label, State(3))], fallback_state: State(7), - attributes: StateAttributes::EMPTY | StateAttributes::TRANSITIONS_TO_ACCEPTING, + attributes: StateAttributes::EMPTY, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label, State(6))], + member_transitions: smallvec![(&label, State(4))], fallback_state: State(5), - attributes: StateAttributes::EMPTY | StateAttributes::TRANSITIONS_TO_ACCEPTING, + attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label, State(8))], - fallback_state: State(7), + member_transitions: smallvec![(&label, State(4))], + fallback_state: State(5), attributes: StateAttributes::ACCEPTING | StateAttributes::TRANSITIONS_TO_ACCEPTING, }, StateTable { array_transitions: smallvec![], member_transitions: smallvec![(&label, State(6))], - fallback_state: State(5), + fallback_state: State(8), attributes: StateAttributes::ACCEPTING | StateAttributes::TRANSITIONS_TO_ACCEPTING, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label, State(2))], - fallback_state: State(1), + member_transitions: smallvec![(&label, State(3))], + fallback_state: State(7), attributes: StateAttributes::ACCEPTING, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label, State(4))], - fallback_state: State(3), + member_transitions: smallvec![(&label, State(6))], + fallback_state: State(8), + attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING, + }, + StateTable { + array_transitions: smallvec![], + member_transitions: smallvec![(&label, State(2))], + fallback_state: 
State(1), attributes: StateAttributes::ACCEPTING, }, ], @@ -870,18 +926,19 @@ mod tests { let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Direct(nfa::Transition::Labelled((&label_x).into())), - NfaState::Recursive(nfa::Transition::Labelled((&label_a).into())), - NfaState::Direct(nfa::Transition::Labelled((&label_b).into())), - NfaState::Direct(nfa::Transition::Labelled((&label_a).into())), - NfaState::Direct(nfa::Transition::Labelled((&label_b).into())), - NfaState::Direct(nfa::Transition::Labelled((&label_c).into())), - NfaState::Recursive(nfa::Transition::Labelled((&label_d).into())), + NfaState::Direct(nfa::Transition::Member(&label_x)), + NfaState::Recursive(nfa::Transition::Member(&label_a)), + NfaState::Direct(nfa::Transition::Member(&label_b)), + NfaState::Direct(nfa::Transition::Member(&label_a)), + NfaState::Direct(nfa::Transition::Member(&label_b)), + NfaState::Direct(nfa::Transition::Member(&label_c)), + NfaState::Recursive(nfa::Transition::Member(&label_d)), NfaState::Accepting, ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -953,16 +1010,17 @@ mod tests { let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Direct(nfa::Transition::Labelled((&label_x).into())), + NfaState::Direct(nfa::Transition::Member(&label_x)), NfaState::Direct(nfa::Transition::Wildcard), - NfaState::Recursive(nfa::Transition::Labelled((&label_a).into())), + NfaState::Recursive(nfa::Transition::Member(&label_a)), NfaState::Direct(nfa::Transition::Wildcard), - NfaState::Direct(nfa::Transition::Labelled((&label_b).into())), + NfaState::Direct(nfa::Transition::Member(&label_b)), NfaState::Accepting, ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); let expected = Automaton { states: vec![ StateTable { @@ -991,25 +1049,19 @@ mod tests { }, StateTable 
{ array_transitions: smallvec![], - member_transitions: smallvec![(&label_a, State(6))], - fallback_state: State(5), + member_transitions: smallvec![(&label_a, State(5))], + fallback_state: State(8), attributes: StateAttributes::EMPTY, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label_a, State(4)), (&label_b, State(8))], - fallback_state: State(3), + member_transitions: smallvec![(&label_a, State(5)), (&label_b, State(6))], + fallback_state: State(8), attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING, }, StateTable { array_transitions: smallvec![], - member_transitions: smallvec![(&label_a, State(6)), (&label_b, State(7))], - fallback_state: State(5), - attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING, - }, - StateTable { - array_transitions: smallvec![], - member_transitions: smallvec![(&label_a, State(4)), (&label_b, State(8))], + member_transitions: smallvec![(&label_a, State(4)), (&label_b, State(7))], fallback_state: State(3), attributes: StateAttributes::ACCEPTING | StateAttributes::TRANSITIONS_TO_ACCEPTING, }, @@ -1019,6 +1071,12 @@ mod tests { fallback_state: State(3), attributes: StateAttributes::ACCEPTING, }, + StateTable { + array_transitions: smallvec![], + member_transitions: smallvec![(&label_a, State(4)), (&label_b, State(7))], + fallback_state: State(3), + attributes: StateAttributes::TRANSITIONS_TO_ACCEPTING, + }, ], }; @@ -1035,10 +1093,10 @@ mod tests { let nfa = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Direct(nfa::Transition::Labelled((&label_a).into())), - NfaState::Direct(nfa::Transition::Labelled((&label_b).into())), - NfaState::Recursive(nfa::Transition::Labelled((&label_c).into())), - NfaState::Recursive(nfa::Transition::Labelled((&label_d).into())), + NfaState::Direct(nfa::Transition::Member(&label_a)), + NfaState::Direct(nfa::Transition::Member(&label_b)), + NfaState::Recursive(nfa::Transition::Member(&label_c)), + NfaState::Recursive(nfa::Transition::Member(&label_d)), 
NfaState::Direct(nfa::Transition::Wildcard), NfaState::Recursive(nfa::Transition::Wildcard), NfaState::Accepting, @@ -1097,8 +1155,152 @@ mod tests { ], }; - let result = minimize(nfa).unwrap(); + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); assert_eq!(result, expected); } + + #[test] + fn array_index_and_slice_combo() { + // Query = $..[3][3::2][3:5:] + // These overlap, but only on index 3. + let label_3 = JsonUInt::from(3); + let label_3_2 = SimpleSlice::new(3.into(), None, 2.into()); + let label_3_5 = SimpleSlice::new(3.into(), Some(5.into()), 1.into()); + + let nfa = NondeterministicAutomaton { + ordered_states: vec![ + NfaState::Recursive(nfa::Transition::Array(label_3.into())), + NfaState::Direct(nfa::Transition::Array(label_3_2.into())), + NfaState::Direct(nfa::Transition::Array(label_3_5.into())), + NfaState::Accepting, + ], + }; + + let mut result = minimize(nfa).unwrap(); + make_canonical(&mut result); + let expected = Automaton { + states: vec![ + StateTable { + array_transitions: smallvec![], + member_transitions: smallvec![], + fallback_state: State(0), + attributes: StateAttributes::REJECTING, + }, + StateTable { + array_transitions: smallvec![ArrayTransition::new(ArrayTransitionLabel::Index(label_3), State(2)),], + member_transitions: smallvec![], + fallback_state: State(1), + attributes: StateAttributes::HAS_ARRAY_TRANSITION, + }, + StateTable { + array_transitions: smallvec![ + ArrayTransition::new(ArrayTransitionLabel::Index(label_3), State(6)), + ArrayTransition::new(ArrayTransitionLabel::Slice(label_3_2), State(3)) + ], + member_transitions: smallvec![], + fallback_state: State(1), + attributes: StateAttributes::HAS_ARRAY_TRANSITION, + }, + StateTable { + array_transitions: smallvec![ + ArrayTransition::new(ArrayTransitionLabel::Index(label_3), State(5)), + ArrayTransition::new(ArrayTransitionLabel::Slice(label_3_5), State(4)), + ], + member_transitions: smallvec![], + fallback_state: State(1), + attributes: 
StateAttributes::HAS_ARRAY_TRANSITION + | StateAttributes::TRANSITIONS_TO_ACCEPTING + | StateAttributes::HAS_ARRAY_TRANSITION_TO_ACCEPTING, + }, + StateTable { + array_transitions: smallvec![ArrayTransition::new(ArrayTransitionLabel::Index(label_3), State(2)),], + member_transitions: smallvec![], + fallback_state: State(1), + attributes: StateAttributes::HAS_ARRAY_TRANSITION | StateAttributes::ACCEPTING, + }, + StateTable { + array_transitions: smallvec![ + ArrayTransition::new(ArrayTransitionLabel::Index(label_3), State(6)), + ArrayTransition::new(ArrayTransitionLabel::Slice(label_3_2), State(3)), + ], + member_transitions: smallvec![], + fallback_state: State(1), + attributes: StateAttributes::HAS_ARRAY_TRANSITION | StateAttributes::ACCEPTING, + }, + StateTable { + array_transitions: smallvec![ + ArrayTransition::new(ArrayTransitionLabel::Index(label_3), State(7)), + ArrayTransition::new(ArrayTransitionLabel::Slice(label_3_2), State(3)), + ArrayTransition::new(ArrayTransitionLabel::Slice(label_3_5), State(4)), + ], + member_transitions: smallvec![], + fallback_state: State(1), + attributes: StateAttributes::HAS_ARRAY_TRANSITION + | StateAttributes::TRANSITIONS_TO_ACCEPTING + | StateAttributes::HAS_ARRAY_TRANSITION_TO_ACCEPTING, + }, + StateTable { + array_transitions: smallvec![ + ArrayTransition::new(ArrayTransitionLabel::Index(label_3), State(7)), + ArrayTransition::new(ArrayTransitionLabel::Slice(label_3_2), State(3)), + ArrayTransition::new(ArrayTransitionLabel::Slice(label_3_5), State(4)), + ], + member_transitions: smallvec![], + fallback_state: State(1), + attributes: StateAttributes::HAS_ARRAY_TRANSITION + | StateAttributes::TRANSITIONS_TO_ACCEPTING + | StateAttributes::HAS_ARRAY_TRANSITION_TO_ACCEPTING + | StateAttributes::ACCEPTING, + }, + ], + }; + + assert_eq!(result, expected); + } + + /// DFA creation is unstable - it can result in many different isomorphic automaton structures. 
+ /// This function relabels the states in a canonical way so that they can be compared for equality. + fn make_canonical(dfa: &mut Automaton) { + let mut translation = vec![0; dfa.states.len()]; + let mut stack = vec![1_u8]; + let mut i = 1_u8; + + while let Some(state) = stack.pop() { + if state == 0 || translation[state as usize] != 0 { + continue; + } + translation[state as usize] = i; + i += 1; + stack.push(dfa.states[state as usize].fallback_state.0); + + for trans in &dfa.states[state as usize].array_transitions { + stack.push(trans.target.0); + } + for (_, target) in &dfa.states[state as usize].member_transitions { + stack.push(target.0); + } + } + + let mut idx = 0_u8; + let mut current_placement = translation.clone(); + while (idx as usize) < translation.len() { + let c_idx = current_placement[idx as usize]; + if idx != c_idx { + dfa.states.swap(idx as usize, c_idx as usize); + current_placement.swap(idx as usize, c_idx as usize); + } else { + dfa.states[idx as usize].fallback_state.0 = + translation[dfa.states[idx as usize].fallback_state.0 as usize]; + for trans in &mut dfa.states[idx as usize].array_transitions { + trans.target.0 = translation[trans.target.0 as usize]; + } + for (_, target) in &mut dfa.states[idx as usize].member_transitions { + target.0 = translation[target.0 as usize]; + } + idx += 1; + } + } + } } diff --git a/crates/rsonpath-lib/src/automaton/nfa.rs b/crates/rsonpath-lib/src/automaton/nfa.rs index 39217869..e7489585 100644 --- a/crates/rsonpath-lib/src/automaton/nfa.rs +++ b/crates/rsonpath-lib/src/automaton/nfa.rs @@ -1,10 +1,10 @@ //! Definition of a nondeterministic automaton that can be directly //! obtained from a JsonPath query. This is then turned into //! a DFA with the minimizer. 
-use crate::error::UnsupportedFeatureError; +use crate::{automaton::SimpleSlice, error::UnsupportedFeatureError}; -use super::error::CompilerError; -use rsonpath_syntax::{num::JsonUInt, str::JsonString, JsonPathQuery}; +use super::{error::CompilerError, ArrayTransitionLabel}; +use rsonpath_syntax::{str::JsonString, JsonPathQuery, Step}; use std::{fmt::Display, ops::Index}; /// An NFA representing a query. It is always a directed path @@ -31,63 +31,14 @@ use NfaState::*; /// A transition in the NFA mapped from a [`JsonPathQuery`] selector. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(super) enum Transition<'q> { - /// A transition matching a specific member or array index. - Labelled(TransitionLabel<'q>), + /// A transition matching array indices. + Array(ArrayTransitionLabel), + /// A transition matching a specific member. + Member(&'q JsonString), /// A transition matching anything. Wildcard, } -/// Represent the distinct methods of moving on a match between states. -#[derive(Debug, Copy, PartialEq, Clone, Eq)] -pub(super) enum TransitionLabel<'q> { - /// Transition when a JSON member name matches a [`JsonString`]. - ObjectMember(&'q JsonString), - /// Transition on the n-th element of an array, with n specified by a [`JsonUInt`]. - ArrayIndex(JsonUInt), -} - -impl<'q> TransitionLabel<'q> { - /// Wraps a [`JsonString`] in a [`TransitionLabel`]. - #[must_use] - #[inline(always)] - pub(super) fn new_object_member(member_name: &'q JsonString) -> Self { - TransitionLabel::ObjectMember(member_name) - } - - /// Wraps a [`JsonUInt`] in a [`TransitionLabel`]. 
- #[must_use] - #[inline(always)] - pub(super) fn new_array_index(index: JsonUInt) -> Self { - TransitionLabel::ArrayIndex(index) - } -} - -impl<'q> From<&'q JsonString> for TransitionLabel<'q> { - #[must_use] - #[inline(always)] - fn from(member_name: &'q JsonString) -> Self { - TransitionLabel::new_object_member(member_name) - } -} - -impl Display for TransitionLabel<'_> { - #[inline(always)] - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - TransitionLabel::ObjectMember(name) => write!(f, "{}", name.unquoted()), - TransitionLabel::ArrayIndex(index) => write!(f, "{}", index.as_u64()), - } - } -} - -impl From for TransitionLabel<'_> { - #[must_use] - #[inline(always)] - fn from(index: JsonUInt) -> Self { - TransitionLabel::new_array_index(index) - } -} - /// State of an [`NondeterministicAutomaton`]. Thin wrapper over a state's /// identifier to distinguish NFA states from DFA states ([`State`](`super::state::State`)). #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] @@ -117,29 +68,47 @@ impl<'q> NondeterministicAutomaton<'q> { /// Returns a [`CompilerError::NotSupported`] if the query contains a construct /// not currently supported by rsonpath. 
pub(super) fn new(query: &'q JsonPathQuery) -> Result { - use rsonpath_syntax::{Index, Segment, Selector}; + use rsonpath_syntax::{Index, Selector}; let states_result: Result, CompilerError> = query .segments() .iter() - .map(|segment| match segment { - Segment::Child(selectors) if selectors.len() == 1 => match selectors.first() { - Selector::Name(name) => Ok(Direct(Transition::Labelled(name.into()))), - Selector::Wildcard => Ok(Direct(Transition::Wildcard)), - Selector::Index(Index::FromStart(index)) => Ok(Direct(Transition::Labelled((*index).into()))), - Selector::Index(Index::FromEnd(_)) => Err(UnsupportedFeatureError::indexing_from_end().into()), - Selector::Slice(_) => Err(UnsupportedFeatureError::slice_selector().into()), - Selector::Filter(_) => Err(UnsupportedFeatureError::filter_selector().into()), - }, - Segment::Descendant(selectors) if selectors.len() == 1 => match selectors.first() { - Selector::Name(name) => Ok(Recursive(Transition::Labelled(name.into()))), - Selector::Wildcard => Ok(Recursive(Transition::Wildcard)), - Selector::Index(Index::FromStart(index)) => Ok(Recursive(Transition::Labelled((*index).into()))), - Selector::Index(Index::FromEnd(_)) => Err(UnsupportedFeatureError::indexing_from_end().into()), - Selector::Slice(_) => Err(UnsupportedFeatureError::slice_selector().into()), - Selector::Filter(_) => Err(UnsupportedFeatureError::filter_selector().into()), - }, - _ => Err(UnsupportedFeatureError::multiple_selectors().into()), + .map(|segment| { + let selectors = segment.selectors(); + + if selectors.len() > 1 { + Err(UnsupportedFeatureError::multiple_selectors().into()) + } else { + let transition = match selectors.first() { + Selector::Name(name) => Ok::<_, CompilerError>(Transition::Member(name)), + Selector::Wildcard => Ok(Transition::Wildcard), + Selector::Index(Index::FromStart(index)) => Ok(Transition::Array((*index).into())), + Selector::Index(Index::FromEnd(_)) => Err(UnsupportedFeatureError::indexing_from_end().into()), + 
Selector::Slice(slice) => { + let start = match slice.start() { + Index::FromStart(idx) => Ok::<_, CompilerError>(idx), + Index::FromEnd(_) => Err(UnsupportedFeatureError::indexing_from_end().into()), + }?; + let end = match slice.end() { + Some(Index::FromStart(idx)) => Ok::<_, CompilerError>(Some(idx)), + Some(Index::FromEnd(_)) => Err(UnsupportedFeatureError::indexing_from_end().into()), + None => Ok(None), + }?; + let step = match slice.step() { + Step::Forward(step) => Ok::<_, CompilerError>(step), + Step::Backward(_) => Err(UnsupportedFeatureError::slice_with_backward_step().into()), + }?; + let simple_slice = SimpleSlice::new(start, end, step); + Ok(Transition::Array(simple_slice.into())) + } + Selector::Filter(_) => Err(UnsupportedFeatureError::filter_selector().into()), + }?; + if segment.is_child() { + Ok(Direct(transition)) + } else { + Ok(Recursive(transition)) + } + } }) .collect(); let mut states = states_result?; @@ -182,25 +151,38 @@ impl<'q> Display for NondeterministicAutomaton<'q> { .ordered_states .iter() .filter_map(|s| match s { - Direct(Transition::Labelled(label)) | Recursive(Transition::Labelled(label)) => Some(*label), + Direct(Transition::Member(label)) | Recursive(Transition::Member(label)) => { + Some(label.unquoted().to_string()) + } + Direct(Transition::Array(label)) | Recursive(Transition::Array(label)) => Some(label.to_string()), _ => None, }) .collect(); for (i, state) in self.ordered_states.iter().enumerate() { match state { - Direct(Transition::Labelled(label)) => { + Direct(Transition::Array(label)) => { writeln!(f, "s{i}.{} -> s{};", label, i + 1)?; } + Direct(Transition::Member(label)) => { + writeln!(f, "s{i}.{} -> s{};", label.unquoted(), i + 1)?; + } Direct(Transition::Wildcard) => { for label in &all_labels { writeln!(f, "s{i}.{} -> s{};", label, i + 1)?; } writeln!(f, "s{i}.X -> s{};", i + 1)?; } - Recursive(Transition::Labelled(label)) => { + Recursive(Transition::Member(label)) => { + writeln!(f, "s{i}.{} -> s{i}, 
s{};", label.unquoted(), i + 1)?; + for label in all_labels.iter().filter(|&l| l != label.unquoted()) { + writeln!(f, "s{i}.{} -> s{i};", label)?; + } + writeln!(f, "s{i}.X -> s{i};")?; + } + Recursive(Transition::Array(label)) => { writeln!(f, "s{i}.{} -> s{i}, s{};", label, i + 1)?; - for label in all_labels.iter().filter(|&l| l != label) { + for label in all_labels.iter().filter(|&l| l != &label.to_string()) { writeln!(f, "s{i}.{} -> s{i};", label)?; } writeln!(f, "s{i}.X -> s{i};")?; @@ -243,10 +225,10 @@ mod tests { let expected_automaton = NondeterministicAutomaton { ordered_states: vec![ - NfaState::Direct(Transition::Labelled((&label_a).into())), - NfaState::Direct(Transition::Labelled((&label_b).into())), - NfaState::Recursive(Transition::Labelled((&label_c).into())), - NfaState::Recursive(Transition::Labelled((&label_d).into())), + NfaState::Direct(Transition::Member(&label_a)), + NfaState::Direct(Transition::Member(&label_b)), + NfaState::Recursive(Transition::Member(&label_c)), + NfaState::Recursive(Transition::Member(&label_d)), NfaState::Direct(Transition::Wildcard), NfaState::Direct(Transition::Wildcard), NfaState::Recursive(Transition::Wildcard), diff --git a/crates/rsonpath-lib/src/automaton/small_set.rs b/crates/rsonpath-lib/src/automaton/small_set.rs index 11f0775e..4950351e 100644 --- a/crates/rsonpath-lib/src/automaton/small_set.rs +++ b/crates/rsonpath-lib/src/automaton/small_set.rs @@ -15,6 +15,9 @@ pub(crate) trait SmallSet: IntoIterator { /// Modify the set to include `elem`. fn insert(&mut self, elem: T); + /// Modify the set to include all elements from `other`. + fn union(&mut self, other: &Self); + /// Returns whether the given `elem` is a member of the set. 
fn contains(&self, elem: T) -> bool; @@ -40,11 +43,6 @@ pub(crate) struct SmallSet256 { half_2: SmallSet128, } -#[derive(Default, Clone, Copy, PartialEq, Eq)] -struct SmallSet128 { - bitmask: u128, -} - impl SmallSet for SmallSet256 { fn len(&self) -> usize { self.half_1.len() + self.half_2.len() @@ -62,6 +60,11 @@ impl SmallSet for SmallSet256 { } } + fn union(&mut self, other: &Self) { + self.half_1.union(&other.half_1); + self.half_2.union(&other.half_2); + } + fn contains(&self, elem: u8) -> bool { if elem < 128 { self.half_1.contains(elem) @@ -102,55 +105,95 @@ impl SmallSet for SmallSet256 { } } -impl SmallSet for SmallSet128 { - fn len(&self) -> usize { - self.bitmask.count_ones() as usize - } +macro_rules! native_small_set { + ($name:ident, $iter:ident, $mask:ty, $size:literal) => { + #[derive(Default, Clone, Copy, PartialEq, Eq)] + struct $name { + bitmask: $mask, + } - fn is_empty(&self) -> bool { - self.bitmask == 0 - } + struct $iter { + bitmask: $mask, + } - fn insert(&mut self, elem: u8) { - self.bitmask |= 1 << elem; - } + impl SmallSet for $name { + fn len(&self) -> usize { + self.bitmask.count_ones() as usize + } - fn contains(&self, elem: u8) -> bool { - (self.bitmask & (1 << elem)) != 0 - } + fn is_empty(&self) -> bool { + self.bitmask == 0 + } - fn iter(&self) -> SmallSet128Iter { - SmallSet128Iter { bitmask: self.bitmask } - } + fn insert(&mut self, elem: u8) { + self.bitmask |= 1 << elem; + } - fn singleton(&self) -> Option { - let elem = self.bitmask.trailing_zeros(); - let elem_mask = 1_u128.wrapping_shl(elem); - let remainder = self.bitmask ^ elem_mask; + fn union(&mut self, other: &Self) { + self.bitmask |= other.bitmask; + } - // CAST: trivially safe as bitmask can have at most 128 zeroes. 
- (remainder == 0).then_some(elem as u8) - } + fn contains(&self, elem: u8) -> bool { + (self.bitmask & (1 << elem)) != 0 + } - fn clear(&mut self) { - self.bitmask = 0; - } + fn iter(&self) -> $iter { + $iter { + bitmask: self.bitmask, + } + } - fn remove_all_before(&mut self, cutoff: u8) { - let mask: u128 = 0xFFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF << cutoff; - self.bitmask &= mask; - } -} + fn singleton(&self) -> Option { + let elem = self.bitmask.trailing_zeros(); + let elem_mask = (1 as $mask).wrapping_shl(elem); + let remainder = self.bitmask ^ elem_mask; -impl IntoIterator for SmallSet128 { - type Item = u8; - type IntoIter = SmallSet128Iter; + // CAST: trivially safe as bitmask can have at most 128 zeroes. + (remainder == 0).then_some(elem as u8) + } - fn into_iter(self) -> Self::IntoIter { - self.iter() - } + fn clear(&mut self) { + self.bitmask = 0; + } + + fn remove_all_before(&mut self, cutoff: u8) { + let mask: $mask = <$mask>::MAX << cutoff; + self.bitmask &= mask; + } + } + + impl IntoIterator for $name { + type Item = u8; + type IntoIter = $iter; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } + } + + impl Iterator for $iter { + type Item = u8; + + fn next(&mut self) -> Option { + let next_elem = self.bitmask.trailing_zeros(); + + if next_elem == $size { + return None; + } + + let elem_mask = 1 << next_elem; + self.bitmask ^= elem_mask; + + // CAST: trivially safe as bitmask can have at most 128 zeroes. 
+ Some(next_elem as u8) + } + } + }; } +native_small_set!(SmallSet64, SmallSet64Iter, u64, 64); +native_small_set!(SmallSet128, SmallSet128Iter, u128, 128); + impl From<[u8; N]> for SmallSet256 { fn from(arr: [u8; N]) -> Self { Self::from_iter(arr) @@ -215,28 +258,6 @@ impl Iterator for SmallSet256Iter { } } -struct SmallSet128Iter { - bitmask: u128, -} - -impl Iterator for SmallSet128Iter { - type Item = u8; - - fn next(&mut self) -> Option { - let next_elem = self.bitmask.trailing_zeros(); - - if next_elem == 128 { - return None; - } - - let elem_mask = 1 << next_elem; - self.bitmask ^= elem_mask; - - // CAST: trivially safe as bitmask can have at most 128 zeroes. - Some(next_elem as u8) - } -} - #[cfg(test)] mod tests256 { use super::*; diff --git a/crates/rsonpath-lib/src/automaton/state.rs b/crates/rsonpath-lib/src/automaton/state.rs index 5a85a807..78925f5b 100644 --- a/crates/rsonpath-lib/src/automaton/state.rs +++ b/crates/rsonpath-lib/src/automaton/state.rs @@ -19,10 +19,10 @@ pub(crate) enum StateAttribute { /// (labelled or fallback) to an [`Accepting`](`StateAttribute::Accepting`) state. TransitionsToAccepting = 0x08, /// Marks that the [`State`] contains some transition labelled with an array index. - HasArrayIndexTransition = 0x10, + HasArrayTransition = 0x10, /// Marks that the [`State`] contains an array-index labelled transition /// to an to an [`Accepting`](`StateAttribute::Accepting`) state. 
- HasArrayIndexTransitionToAccepting = 0x20, + HasArrayTransitionToAccepting = 0x20, } pub(crate) struct StateAttributesBuilder { @@ -56,12 +56,12 @@ impl StateAttributesBuilder { self.set(StateAttribute::TransitionsToAccepting) } - pub(crate) fn has_array_index_transition(self) -> Self { - self.set(StateAttribute::HasArrayIndexTransition) + pub(crate) fn has_array_transition(self) -> Self { + self.set(StateAttribute::HasArrayTransition) } - pub(crate) fn has_array_index_transition_to_accepting(self) -> Self { - self.set(StateAttribute::HasArrayIndexTransitionToAccepting) + pub(crate) fn has_array_transition_to_accepting(self) -> Self { + self.set(StateAttribute::HasArrayTransitionToAccepting) } pub(crate) fn build(self) -> StateAttributes { @@ -106,12 +106,11 @@ impl StateAttributes { /// A state is _unitary_ if it contains exactly one labelled transition /// and its fallback transition is [`Rejecting`](`StateAttributes::is_rejecting`). pub const UNITARY: Self = Self(StateAttribute::Unitary as u8); - /// Marks that the [`State`] contains some transition labelled with an array index. - pub const HAS_ARRAY_INDEX_TRANSITION: Self = Self(StateAttribute::HasArrayIndexTransition as u8); - /// Marks that the [`State`] contains an array-index labelled transition + /// Marks that the [`State`] contains some transition labelled with an array index or slice. + pub const HAS_ARRAY_TRANSITION: Self = Self(StateAttribute::HasArrayTransition as u8); + /// Marks that the [`State`] contains an array index- or slice-labelled transition /// to an to an [`Accepting`](`StateAttributes::is_accepting`) state. - pub const HAS_ARRAY_INDEX_TRANSITION_TO_ACCEPTING: Self = - Self(StateAttribute::HasArrayIndexTransitionToAccepting as u8); + pub const HAS_ARRAY_TRANSITION_TO_ACCEPTING: Self = Self(StateAttribute::HasArrayTransitionToAccepting as u8); /// Check if the the state is accepting. 
#[inline(always)] @@ -145,19 +144,19 @@ impl StateAttributes { self.is_set(StateAttribute::Unitary) } - /// Marks that the [`State`] contains some transition labelled with an array index. + /// Marks that the [`State`] contains some transition labelled with an array index or slice. #[inline(always)] #[must_use] - pub fn has_array_index_transition(&self) -> bool { - self.is_set(StateAttribute::HasArrayIndexTransition) + pub fn has_array_transition(&self) -> bool { + self.is_set(StateAttribute::HasArrayTransition) } - /// Marks that the [`State`] contains an array-index labelled transition + /// Marks that the [`State`] contains an array index- or slice- labelled transition /// to an to an [`Accepting`](`StateAttributes::is_accepting`) state. #[inline(always)] #[must_use] - pub fn has_array_index_transition_to_accepting(&self) -> bool { - self.is_set(StateAttribute::HasArrayIndexTransitionToAccepting) + pub fn has_array_transition_to_accepting(&self) -> bool { + self.is_set(StateAttribute::HasArrayTransitionToAccepting) } #[inline(always)] diff --git a/crates/rsonpath-lib/src/engine/main.rs b/crates/rsonpath-lib/src/engine/main.rs index ddcd3559..f5bc9995 100644 --- a/crates/rsonpath-lib/src/engine/main.rs +++ b/crates/rsonpath-lib/src/engine/main.rs @@ -6,10 +6,7 @@ //! the JSON structure, which allows efficient SIMD operations and optimized register usage. #![allow(clippy::type_complexity)] // The private Classifier type is very complex, but we specifically macro it out. 
use crate::{ - automaton::{ - error::CompilerError, - {Automaton, State}, - }, + automaton::{error::CompilerError, Automaton, State}, classification::{ simd::{self, config_simd, dispatch_simd, Simd, SimdConfiguration}, structural::{BracketType, Structural, StructuralIterator}, @@ -377,20 +374,21 @@ where debug!("Opening {bracket_type:?}, increasing depth and pushing stack.",); let mut any_matched = false; - let colon_idx = self.find_preceding_colon(idx); - - 'trans: { - for &(i, target) in self.automaton[self.state].array_transitions() { - if self.is_list && i.eq(&self.array_count) { + if self.is_list { + for trans in self.automaton[self.state].array_transitions() { + if trans.matches(self.array_count) { + let target = trans.target_state(); any_matched = true; self.transition_to(target, bracket_type); if self.automaton.is_accepting(target) { debug!("Accept {idx}"); self.record_match_detected_at(idx, NodeTypeHint::Complex(bracket_type))?; } - break 'trans; + break; } } + } else { + let colon_idx = self.find_preceding_colon(idx); for &(member_name, target) in self.automaton[self.state].member_transitions() { if let Some(colon_idx) = colon_idx { @@ -400,7 +398,7 @@ where if self.automaton.is_accepting(target) { self.record_match_detected_at(colon_idx + 1, NodeTypeHint::Complex(bracket_type))?; } - break 'trans; + break; } } } diff --git a/crates/rsonpath-lib/src/error.rs b/crates/rsonpath-lib/src/error.rs index ed764cbf..f154c5c3 100644 --- a/crates/rsonpath-lib/src/error.rs +++ b/crates/rsonpath-lib/src/error.rs @@ -154,6 +154,14 @@ impl UnsupportedFeatureError { Self::tracked(152, "Slice Selector") } + /// Slice with Backward Step – supporting slice selectors that step backwards. + /// Unsupported and not planned (yet). + #[must_use] + #[inline(always)] + pub fn slice_with_backward_step() -> Self { + Self::untracked("Slice with Backward Step") + } + /// Filter Selector – supporting filter selectors. 
/// #[must_use] diff --git a/crates/rsonpath-syntax/src/lib.rs b/crates/rsonpath-syntax/src/lib.rs index c9d390ed..aa894fb3 100644 --- a/crates/rsonpath-syntax/src/lib.rs +++ b/crates/rsonpath-syntax/src/lib.rs @@ -1071,6 +1071,36 @@ impl Index { } } +impl Step { + /// Check if this is a step going forward in an array. + /// + /// # Examples + /// ``` + /// # use rsonpath_syntax::{Selector, Step}; + /// let step = Step::Forward(2.try_into().unwrap()); + /// assert!(step.is_forward()); + /// ``` + #[inline(always)] + #[must_use] + pub const fn is_forward(&self) -> bool { + matches!(self, Self::Forward(_)) + } + + /// Check if this is a step going backward in an array. + /// + /// # Examples + /// ``` + /// # use rsonpath_syntax::{Selector, Step}; + /// let step = Step::Backward(2.try_into().unwrap()); + /// assert!(step.is_backward()); + /// ``` + #[inline(always)] + #[must_use] + pub const fn is_backward(&self) -> bool { + matches!(self, Self::Backward(_)) + } +} + impl Deref for Selectors { type Target = [Selector]; diff --git a/crates/rsonpath-syntax/src/num.rs b/crates/rsonpath-syntax/src/num.rs index 2cbb48cb..5e4e7378 100644 --- a/crates/rsonpath-syntax/src/num.rs +++ b/crates/rsonpath-syntax/src/num.rs @@ -721,6 +721,14 @@ impl From for u64 { } } +impl From for i64 { + #[inline(always)] + fn from(value: JsonUInt) -> Self { + // Safe cast since JsonUInt::MAX is lower than i64::MAX. + value.0 as Self + } +} + impl TryFrom for JsonUInt { type Error = JsonIntOverflowError; diff --git a/crates/rsonpath-test/documents/toml/recursive_index_and_slice.toml b/crates/rsonpath-test/documents/toml/recursive_index_and_slice.toml new file mode 100644 index 00000000..17e50ba7 --- /dev/null +++ b/crates/rsonpath-test/documents/toml/recursive_index_and_slice.toml @@ -0,0 +1,66 @@ +# Define the JSON input for all query test cases. +[input] +# Short description of the input structure. +description = "Nested arrays with integers." 
+ # Set to true only if your specific test input is fully compressed (no extraneous whitespace). +is_compressed = false + +# Inline JSON document. +[input.source] +json_string = ''' +[ + 0, + 1, + 2, + [ + 10, + 11, + 12, + [13] + ], + [ + 20, + 21, + 22, + [23] + ], +] +''' + +# Define queries to test on the input. +[[queries]] + # Valid JSONPath query string. +query = "$..[3::][3]" +# Short descritpion of the query semantics. +description = "select slice starting at 3 and then 3" + +[queries.results] +# Number of expected matches. +count = 2 +# Byte locations of spans of all matches, in order. +spans = [[49, 53], [91, 95]] +# Stringified values of all matches, verbatim as in the input, +# in the same order as above. +nodes = [ + '[13]', + '[23]' +] + +# Define queries to test on the input. +[[queries]] + # Valid JSONPath query string. +query = "$..[3::][3][0]" +# Short descritpion of the query semantics. +description = "select slice starting at 3 and then 3 and then 0" + +[queries.results] +# Number of expected matches. +count = 2 +# Byte locations of spans of all matches, in order. +spans = [[50, 52], [92, 94]] +# Stringified values of all matches, verbatim as in the input, +# in the same order as above. +nodes = [ + '13', + '23' +] \ No newline at end of file diff --git a/crates/rsonpath-test/documents/toml/recursive_slice_overlaps.toml b/crates/rsonpath-test/documents/toml/recursive_slice_overlaps.toml new file mode 100644 index 00000000..7c76c98d --- /dev/null +++ b/crates/rsonpath-test/documents/toml/recursive_slice_overlaps.toml @@ -0,0 +1,171 @@ +# Define the JSON input for all query test cases. +[input] +# Short description of the input structure. +description = "Nested arrays with integers." +# Set to true only if your specific test input is fully compressed (no extraneous whitespace). +is_compressed = false + +# Inline JSON document. 
+[input.source] +json_string = ''' +[ + [[1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019], + [1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1118,1119], + [1200,1201,1202,1203,1204,1205,1206,1207,1208,1209,1210,1211,1212,1213,1214,1215,1216,1217,1218,1219], + [1300,1301,1302,1303,1304,1305,1306,1307,1308,1309,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319], + [1400,1401,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411,1412,1413,1414,1415,1416,1417,1418,1419], + [1500,1501,1502,1503,1504,1505,1506,1507,1508,1509,1510,1511,1512,1513,1514,1515,1516,1517,1518,1519], + [1600,1601,1602,1603,1604,1605,1606,1607,1608,1609,1610,1611,1612,1613,1614,1615,1616,1617,1618,1619], + [1700,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710,1711,1712,1713,1714,1715,1716,1717,1718,1719], + [1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815,1816,1817,1818,1819], + [1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913,1914,1915,1916,1917,1918,1919]], + [[2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019], + [2100,2101,2102,2103,2104,2105,2106,2107,2108,2109,2110,2111,2112,2113,2114,2115,2116,2117,2118,2119], + [2200,2201,2202,2203,2204,2205,2206,2207,2208,2209,2210,2211,2212,2213,2214,2215,2216,2217,2218,2219], + [2300,2301,2302,2303,2304,2305,2306,2307,2308,2309,2310,2311,2312,2313,2314,2315,2316,2317,2318,2319], + [2400,2401,2402,2403,2404,2405,2406,2407,2408,2409,2410,2411,2412,2413,2414,2415,2416,2417,2418,2419], + [2500,2501,2502,2503,2504,2505,2506,2507,2508,2509,2510,2511,2512,2513,2514,2515,2516,2517,2518,2519], + [2600,2601,2602,2603,2604,2605,2606,2607,2608,2609,2610,2611,2612,2613,2614,2615,2616,2617,2618,2619], + [2700,2701,2702,2703,2704,2705,2706,2707,2708,2709,2710,2711,2712,2713,2714,2715,2716,2717,2718,2719], + 
[2800,2801,2802,2803,2804,2805,2806,2807,2808,2809,2810,2811,2812,2813,2814,2815,2816,2817,2818,2819], + [2900,2901,2902,2903,2904,2905,2906,2907,2908,2909,2910,2911,2912,2913,2914,2915,2916,2917,2918,2919]], + [[3000,3001,3002,3003,3004,3005,3006,3007,3008,3009,3010,3011,3012,3013,3014,3015,3016,3017,3018,3019], + [3100,3101,3102,3103,3104,3105,3106,3107,3108,3109,3110,3111,3112,3113,3114,3115,3116,3117,3118,3119], + [3200,3201,3202,3203,3204,3205,3206,3207,3208,3209,3210,3211,3212,3213,3214,3215,3216,3217,3218,3219], + [3300,3301,3302,3303,3304,3305,3306,3307,3308,3309,3310,3311,3312,3313,3314,3315,3316,3317,3318,3319], + [3400,3401,3402,3403,3404,3405,3406,3407,3408,3409,3410,3411,3412,3413,3414,3415,3416,3417,3418,3419], + [3500,3501,3502,3503,3504,3505,3506,3507,3508,3509,3510,3511,3512,3513,3514,3515,3516,3517,3518,3519], + [3600,3601,3602,3603,3604,3605,3606,3607,3608,3609,3610,3611,3612,3613,3614,3615,3616,3617,3618,3619], + [3700,3701,3702,3703,3704,3705,3706,3707,3708,3709,3710,3711,3712,3713,3714,3715,3716,3717,3718,3719], + [3800,3801,3802,3803,3804,3805,3806,3807,3808,3809,3810,3811,3812,3813,3814,3815,3816,3817,3818,3819], + [3900,3901,3902,3903,3904,3905,3906,3907,3908,3909,3910,3911,3912,3913,3914,3915,3916,3917,3918,3919]], + [[4000,4001,4002,4003,4004,4005,4006,4007,4008,4009,4010,4011,4012,4013,4014,4015,4016,4017,4018,4019], + [4100,4101,4102,4103,4104,4105,4106,4107,4108,4109,4110,4111,4112,4113,4114,4115,4116,4117,4118,4119], + [4200,4201,4202,4203,4204,4205,4206,4207,4208,4209,4210,4211,4212,4213,4214,4215,4216,4217,4218,4219], + [4300,4301,4302,4303,4304,4305,4306,4307,4308,4309,4310,4311,4312,4313,4314,4315,4316,4317,4318,4319], + [4400,4401,4402,4403,4404,4405,4406,4407,4408,4409,4410,4411,4412,4413,4414,4415,4416,4417,4418,4419], + [4500,4501,4502,4503,4504,4505,4506,4507,4508,4509,4510,4511,4512,4513,4514,4515,4516,4517,4518,4519], + [4600,4601,4602,4603,4604,4605,4606,4607,4608,4609,4610,4611,4612,4613,4614,4615,4616,4617,4618,4619], + 
[4700,4701,4702,4703,4704,4705,4706,4707,4708,4709,4710,4711,4712,4713,4714,4715,4716,4717,4718,4719], + [4800,4801,4802,4803,4804,4805,4806,4807,4808,4809,4810,4811,4812,4813,4814,4815,4816,4817,4818,4819], + [4900,4901,4902,4903,4904,4905,4906,4907,4908,4909,4910,4911,4912,4913,4914,4915,4916,4917,4918,4919]], + [[5000,5001,5002,5003,5004,5005,5006,5007,5008,5009,5010,5011,5012,5013,5014,5015,5016,5017,5018,5019], + [5100,5101,5102,5103,5104,5105,5106,5107,5108,5109,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119], + [5200,5201,5202,5203,5204,5205,5206,5207,5208,5209,5210,5211,5212,5213,5214,5215,5216,5217,5218,5219], + [5300,5301,5302,5303,5304,5305,5306,5307,5308,5309,5310,5311,5312,5313,5314,5315,5316,5317,5318,5319], + [5400,5401,5402,5403,5404,5405,5406,5407,5408,5409,5410,5411,5412,5413,5414,5415,5416,5417,5418,5419], + [5500,5501,5502,5503,5504,5505,5506,5507,5508,5509,5510,5511,5512,5513,5514,5515,5516,5517,5518,5519], + [5600,5601,5602,5603,5604,5605,5606,5607,5608,5609,5610,5611,5612,5613,5614,5615,5616,5617,5618,5619], + [5700,5701,5702,5703,5704,5705,5706,5707,5708,5709,5710,5711,5712,5713,5714,5715,5716,5717,5718,5719], + [5800,5801,5802,5803,5804,5805,5806,5807,5808,5809,5810,5811,5812,5813,5814,5815,5816,5817,5818,5819], + [5900,5901,5902,5903,5904,5905,5906,5907,5908,5909,5910,5911,5912,5913,5914,5915,5916,5917,5918,5919]], + [[6000,6001,6002,6003,6004,6005,6006,6007,6008,6009,6010,6011,6012,6013,6014,6015,6016,6017,6018,6019], + [6100,6101,6102,6103,6104,6105,6106,6107,6108,6109,6110,6111,6112,6113,6114,6115,6116,6117,6118,6119], + [6200,6201,6202,6203,6204,6205,6206,6207,6208,6209,6210,6211,6212,6213,6214,6215,6216,6217,6218,6219], + [6300,6301,6302,6303,6304,6305,6306,6307,6308,6309,6310,6311,6312,6313,6314,6315,6316,6317,6318,6319], + [6400,6401,6402,6403,6404,6405,6406,6407,6408,6409,6410,6411,6412,6413,6414,6415,6416,6417,6418,6419], + [6500,6501,6502,6503,6504,6505,6506,6507,6508,6509,6510,6511,6512,6513,6514,6515,6516,6517,6518,6519], + 
[6600,6601,6602,6603,6604,6605,6606,6607,6608,6609,6610,6611,6612,6613,6614,6615,6616,6617,6618,6619], + [6700,6701,6702,6703,6704,6705,6706,6707,6708,6709,6710,6711,6712,6713,6714,6715,6716,6717,6718,6719], + [6800,6801,6802,6803,6804,6805,6806,6807,6808,6809,6810,6811,6812,6813,6814,6815,6816,6817,6818,6819], + [6900,6901,6902,6903,6904,6905,6906,6907,6908,6909,6910,6911,6912,6913,6914,6915,6916,6917,6918,6919]] +] +''' + +# Define queries to test on the input. +[[queries]] +# Valid JSONPath query string. +query = "$..[3::2][4::3][::5]" +# Short descritpion of the query semantics. +description = "three consecutive overlapping slices" + +[queries.results] +# Number of expected matches. +count = 16 +# Byte locations of spans of all matches, in order. +spans = [ + [ + 3652, + 3656, + ], + [ + 3677, + 3681, + ], + [ + 3702, + 3706, + ], + [ + 3727, + 3731, + ], + [ + 3973, + 3977, + ], + [ + 3998, + 4002, + ], + [ + 4023, + 4027, + ], + [ + 4048, + 4052, + ], + [ + 5796, + 5800, + ], + [ + 5821, + 5825, + ], + [ + 5846, + 5850, + ], + [ + 5871, + 5875, + ], + [ + 6117, + 6121, + ], + [ + 6142, + 6146, + ], + [ + 6167, + 6171, + ], + [ + 6192, + 6196, + ], +] +# Stringified values of all matches, verbatim as in the input, +# in the same order as above. +nodes = [ + '4400', + '4405', + '4410', + '4415', + '4700', + '4705', + '4710', + '4715', + '6400', + '6405', + '6410', + '6415', + '6700', + '6705', + '6710', + '6715', +] diff --git a/crates/rsonpath/src/main.rs b/crates/rsonpath/src/main.rs index 4ab23d1d..6f7fe5b6 100644 --- a/crates/rsonpath/src/main.rs +++ b/crates/rsonpath/src/main.rs @@ -35,6 +35,7 @@ fn run_with_args(args: &Args) -> Result<()> { if args.compile { // Only compilation was requested, so we print the automaton and exit. println!("{automaton}"); + debug!("{automaton:?}"); Ok(()) } else { // Actual query execution. 
diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 4cc25230..861a68f8 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -301,18 +301,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", diff --git a/rsonpath.code-workspace b/rsonpath.code-workspace index 39870293..7928a0d9 100644 --- a/rsonpath.code-workspace +++ b/rsonpath.code-workspace @@ -16,10 +16,12 @@ "cmpeq", "codegen", "color_eyre", + "complementation", "cvtsi", "datavalue", "dealloc", "Deque", + "determinization", "docsrs", "ebnf", "endianness",