From 27479ca20414d3f22e57ac4c15c338ca18b25b9b Mon Sep 17 00:00:00 2001
From: Easyoakland <97992568+Easyoakland@users.noreply.github.com>
Date: Mon, 20 Nov 2023 22:27:42 -0700
Subject: [PATCH] feat: `StrStreamTokens` Handle `Tokens::parse()` more
 efficiently for Iterator<Item = char>

---
 README.md                              |   7 +-
 src/lib.rs                             |  15 +-
 src/stream_tokens.rs                   | 167 +++++++++--------
 src/stream_tokens/str_stream_tokens.rs | 246 +++++++++++++++++++++++++
 4 files changed, 355 insertions(+), 80 deletions(-)
 create mode 100644 src/stream_tokens/str_stream_tokens.rs

diff --git a/README.md b/README.md
index be586d2..3180c43 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ use yap_streaming::{
     // to get an instance of the above:
     IntoTokens,
     // Allows you to get an instance of `Tokens` that supports streams:
-    StreamTokens,
+    StrStreamTokens,
     // This trait has all of the parsing methods on it:
     Tokens,
 };
@@ -138,8 +138,9 @@ let file_chars = BufReader::new(File::open("examples/opOrDigit.txt").expect("ope
         }
     }
 });
-// Convert to something implementing `Tokens`
-let mut tokens = StreamTokens::new(file_chars);
+// Convert to something implementing `Tokens`.
+// If parsing a stream not of `char` use [`yap_streaming::StreamTokens`] instead.
+let mut tokens = StrStreamTokens::new(file_chars);
 // Parse
 assert_eq!(eval(&mut tokens), 140);
 // Check that parse encountered no io errors.
diff --git a/src/lib.rs b/src/lib.rs
index ebf46c4..9a204b0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -31,7 +31,7 @@ use yap_streaming::{
     // to get an instance of the above:
     IntoTokens,
     // Allows you to get an instance of `Tokens` that supports streams:
-    StreamTokens,
+    StrStreamTokens,
     // This trait has all of the parsing methods on it:
     Tokens,
 };
@@ -138,8 +138,9 @@ let file_chars = BufReader::new(File::open("examples/opOrDigit.txt").expect("ope
         }
     }
 });
-// Convert to something implementing `Tokens`
-let mut tokens = StreamTokens::new(file_chars);
+// Convert to something implementing `Tokens`.
+// If parsing a stream not of `char` use [`yap_streaming::StreamTokens`] instead.
+let mut tokens = StrStreamTokens::new(file_chars);
 // Parse
 assert_eq!(eval(&mut tokens), 140);
 // Check that parse encountered no io errors.
@@ -147,7 +148,11 @@ assert!(io_err.is_none());
 # }
 ```
 */
-#![deny(missing_docs)]
+#![deny(
+    missing_copy_implementations,
+    missing_debug_implementations,
+    missing_docs
+)]
 #![cfg_attr(not(feature = "std"), no_std)]
 #[cfg(feature = "alloc")]
 extern crate alloc;
@@ -155,5 +160,5 @@ extern crate alloc;
 #[cfg(feature = "alloc")]
 mod stream_tokens;
 #[cfg(feature = "alloc")]
-pub use stream_tokens::{StreamTokens, StreamTokensLocation};
+pub use stream_tokens::{str_stream_tokens::StrStreamTokens, StreamTokens, StreamTokensLocation};
 pub use yap::{IntoTokens, TokenLocation, Tokens};
diff --git a/src/stream_tokens.rs b/src/stream_tokens.rs
index ecc1e20..67bc5fe 100644
--- a/src/stream_tokens.rs
+++ b/src/stream_tokens.rs
@@ -1,36 +1,64 @@
 use alloc::{collections::VecDeque, rc::Rc, vec::Vec};
-use core::{cell::RefCell, fmt::Debug, iter::Iterator};
+use core::{
+    cell::RefCell,
+    fmt::Debug,
+    iter::{Fuse, Iterator},
+};
 use yap::{IntoTokens, TokenLocation, Tokens};
 
-/// Buffer over items of an iterator.
-#[derive(Clone, Debug, PartialEq, Eq)]
-struct Buffer<Item> {
-    oldest_elem_id: usize,
-    elements: VecDeque<Option<Item>>,
+pub(crate) mod str_stream_tokens;
+
+/// Helper trait for defining buffers that can be used to store items in [`StreamTokens`] for [`Tokens::set_location()`] resets.
+pub trait StreamTokensBuffer<Item>: Default {
+    /// Remove n items from the front of the buffer. If the buffer has fewer than `n` elements, clear the buffer.
+    fn drain_front(&mut self, n: usize);
+    /// Add a new item to the back of the buffer.
+    fn push(&mut self, item: Item);
+    /// Get the item at the given `idx` if it exists.
+    fn get(&self, idx: usize) -> Option<Item>;
 }
 
-// Manual impl because Item: !Default also works.
-impl<Item> Default for Buffer<Item> {
-    fn default() -> Self {
-        Self {
-            oldest_elem_id: Default::default(),
-            elements: Default::default(),
+impl<Item: Clone> StreamTokensBuffer<Item> for VecDeque<Item> {
+    fn drain_front(&mut self, n: usize) {
+        if n > self.len() {
+            self.clear()
+        } else {
+            // TODO test this vs self.drain(..n) performance
+            for _ in 0..n {
+                self.pop_front();
+            }
         }
     }
+
+    fn push(&mut self, item: Item) {
+        self.push_back(item)
+    }
+
+    fn get(&self, idx: usize) -> Option<Item> {
+        self.get(idx).cloned()
+    }
+}
+
+/// Buffer over items of an iterator.
+#[derive(Clone, Default, Debug, PartialEq, Eq)]
+struct Buffer<Buf> {
+    oldest_elem_cursor: usize,
+    elements: Buf,
 }
 
-/// Enables parsing a stream of values from an iterator that can't itself be cloned.
-/// In order to be able to rewind the iterator it must save values since the oldest not [`Drop`]ed [`StreamTokensLocation`]
+/// Enables parsing a stream of values from a [`Fuse`]d iterator that can't itself be cloned.
+/// In order to be able to rewind the iterator it must save values since the oldest not [`Drop`]ed [`StreamTokensLocation`] into `Buf`.
+///
+/// See [`Self::new`] for example usage.
 #[derive(Debug)]
-pub struct StreamTokens<I>
+pub struct StreamTokens<I: Iterator, Buf = VecDeque<I::Item>>
 where
     I: Iterator,
 {
-    iter: I,
+    iter: Fuse<I>,
     cursor: usize,
-    /// Buffer of items and the id of the oldest item in the buffer.
-    buffer: Buffer<I::Item>,
-    /// Sorted list of the oldest items needed per location
+    buffer: Buffer<Buf>,
+    /// Sorted list of the oldest items needed per live location
     checkout: Rc<RefCell<Vec<usize>>>,
 }
 
@@ -87,14 +115,26 @@ impl TokenLocation for StreamTokensLocation {
     }
 }
-impl<I: Iterator> StreamTokens<I>
+impl<I: Iterator, Buf: Default> StreamTokens<I, Buf> {
+    /// Generic new function allowing arbitrary buffer.
+    /// Exists because type inference is not smart enough to try the default generic when calling [`Self::new`], so `new` hardcodes the default.
+    /// See
+    pub(crate) fn _new(iter: I) -> Self {
+        StreamTokens {
+            // Store a fused iterator so the buffer can safely be of `Item` instead of `Option<Item>`
+            iter: iter.fuse(),
+            cursor: Default::default(),
+            buffer: Default::default(),
+            checkout: Default::default(),
+        }
+    }
+}
+
+impl<I: Iterator> StreamTokens<I, VecDeque<I::Item>>
 where
     I::Item: Clone,
 {
-    /// We can't define a blanket impl for [`IntoTokens`] on all `impl Iterator` without
-    /// [specialization](https://rust-lang.github.io/rfcs/1210-impl-specialization.html).
-    ///
-    /// Instead, use this method to convert a suitable iterator into [`Tokens`].
+    /// Use this method to convert a suitable iterator into [`Tokens`].
     ///
     /// # Example
     ///
     /// ```rust
@@ -113,19 +153,15 @@ where
     /// assert!(tokens.tokens("world".chars()));
     /// ```
     pub fn new(iter: I) -> Self {
-        StreamTokens {
-            iter,
-            cursor: Default::default(),
-            buffer: Default::default(),
-            checkout: Default::default(),
-        }
+        Self::_new(iter)
     }
 }
 
-impl<I> Tokens for StreamTokens<I>
+impl<I, Buffer> Tokens for StreamTokens<I, Buffer>
 where
     I: Iterator,
-    I::Item: Clone + Debug,
+    I::Item: Clone,
+    Buffer: StreamTokensBuffer<I::Item>,
 {
     type Item = I::Item;
 
@@ -140,10 +176,9 @@ where
             if let Some(val) = self
                 .buffer
                 .elements
-                .get(self.cursor - 1 - self.buffer.oldest_elem_id)
-                .cloned()
+                .get(self.cursor - 1 - self.buffer.oldest_elem_cursor)
             {
-                return val;
+                return Some(val);
             }
         }
 
@@ -155,21 +190,20 @@ where
                 Some(&x) => x.min(self.cursor),
                 None => self.cursor,
             };
-            while (self.buffer.oldest_elem_id < min) && (!self.buffer.elements.is_empty()) {
-                self.buffer.elements.pop_front();
-                self.buffer.oldest_elem_id += 1;
-            }
+            let delta = min - self.buffer.oldest_elem_cursor;
+            self.buffer.elements.drain_front(delta);
+            self.buffer.oldest_elem_cursor = min;
         }
 
         // Handle cache miss
         {
-            let next = self.iter.next();
+            let next = self.iter.next()?;
            // Don't save to buffer if no locations exist which might need the value again
             if checkout.is_empty() {
-                next
+                Some(next)
             } else {
-                self.buffer.elements.push_back(next.clone());
-                next
+                self.buffer.elements.push(next.clone());
+                Some(next)
             }
         }
     }
@@ -197,10 +231,11 @@ where
     }
 }
 
-impl<I> IntoTokens for StreamTokens<I>
+impl<I, Buf> IntoTokens for StreamTokens<I, Buf>
 where
     I: Iterator,
     I::Item: Clone + core::fmt::Debug,
+    Buf: StreamTokensBuffer<I::Item>,
 {
     type Tokens = Self;
     fn into_tokens(self) -> Self {
@@ -213,7 +248,6 @@ mod tests {
     use super::*;
 
     #[test]
-    #[cfg(feature = "alloc")]
     fn stream_tokens_sanity_check() {
         // In reality, one should always prefer to use StrTokens for strings:
         let chars: &mut dyn Iterator<Item = char> = &mut "hello \n\t world".chars();
@@ -237,36 +271,25 @@ mod tests {
     }
 
     #[test]
-    #[cfg(feature = "alloc")]
-    fn non_fused() {
-        let mut it1 = "hello".chars();
-        let mut it2 = "world".chars();
-        let mut once = true;
-        let it = core::iter::from_fn(|| {
-            if let Some(x) = it1.next() {
-                Some(x)
-            } else if once {
-                once = false;
-                None
-            } else {
-                it2.next()
-            }
-        });
-        let mut tokens = StreamTokens::new(it);
+    fn str_stream_tokens_sanity_check() {
+        // In reality, one should always prefer to use StrTokens for strings:
+        let chars: &mut dyn Iterator<Item = char> = &mut "hello \n\t world".chars();
+        // Can't `chars.clone()` so:
+        let mut tokens = crate::StrStreamTokens::new(chars);
+
+        let loc = tokens.location();
         assert!(tokens.tokens("hello".chars()));
 
-        let none_next = tokens.location();
-        assert_eq!(tokens.next(), None);
-        assert!(tokens.tokens("world".chars()));
-        assert_eq!(tokens.next(), None);
-        assert_eq!(tokens.next(), None);
-        assert_eq!(tokens.next(), None);
+        tokens.set_location(loc.clone());
+        assert!(tokens.tokens("hello".chars()));
+
+        tokens.skip_while(|c| c.is_whitespace());
 
-        tokens.set_location(none_next);
-        assert_eq!(tokens.next(), None);
         assert!(tokens.tokens("world".chars()));
-        assert_eq!(tokens.next(), None);
-        assert_eq!(tokens.next(), None);
-        assert_eq!(tokens.next(), None);
+
+        tokens.set_location(loc);
+        assert!(tokens.tokens("hello \n\t world".chars()));
+
+        assert_eq!(None, tokens.next())
     }
 }
diff --git a/src/stream_tokens/str_stream_tokens.rs b/src/stream_tokens/str_stream_tokens.rs
new file mode 100644
index 0000000..d280d10
--- /dev/null
+++ b/src/stream_tokens/str_stream_tokens.rs
@@ -0,0 +1,246 @@
+use super::StreamTokensBuffer;
+use crate::StreamTokens;
+use alloc::string::String;
+use yap::Tokens;
+
+/// [`StrStreamTokens`] is like [`StreamTokens`] but optimized for more efficient usage of [`Tokens::parse()`] and related methods when wrapping `Iterator<Item = char>`.
+///
+/// See [`Self::new`] for example usage.
+#[derive(Debug)]
+pub struct StrStreamTokens<
+    I: Iterator<Item = char>,
+    Buffer: StreamTokensBuffer<char> + core::ops::Deref<Target = str>,
+>(StreamTokens<I, Buffer>);
+
+impl StreamTokensBuffer<char> for String {
+    fn drain_front(&mut self, n: usize) {
+        if n > self.len() {
+            self.clear()
+        } else {
+            self.drain(..n).for_each(drop);
+        }
+    }
+
+    fn push(&mut self, item: char) {
+        self.push(item)
+    }
+
+    fn get(&self, idx: usize) -> Option<char> {
+        self.chars().nth(idx)
+    }
+}
+
+impl<I> StrStreamTokens<I, String>
+where
+    I: Iterator<Item = char>,
+    I::Item: Clone,
+{
+    /// Use this method to convert a suitable iterator into [`Tokens`].
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use yap_streaming::{Tokens, StrStreamTokens};
+    ///
+    /// // In normal usage, "hello \n\t world".into_tokens()
+    /// // would be preferred here (which would give StrTokens).
+    /// // This is just to demonstrate using StrStreamTokens:
+    /// let chars_iter = "hello \n\t world123".chars();
+    /// let mut tokens = StrStreamTokens::new(chars_iter);
+    ///
+    /// // now we have tokens, we can do some parsing:
+    /// assert!(tokens.tokens("hello".chars()));
+    /// tokens.skip_while(|c| c.is_whitespace());
+    /// assert!(tokens.tokens("world".chars()));
+    ///
+    /// // And parsing can be efficiently achieved:
+    /// assert_eq!(tokens.parse::<u32, String>(), Ok(123));
+    /// ```
+    pub fn new(iter: I) -> Self {
+        Self(StreamTokens::_new(iter))
+    }
+}
+
+impl<I, Buffer> Tokens for StrStreamTokens<I, Buffer>
+where
+    I: Iterator<Item = char>,
+    I::Item: Clone,
+    Buffer: StreamTokensBuffer<char> + core::ops::Deref<Target = str>,
+{
+    type Item = <StreamTokens<I, Buffer> as Tokens>::Item;
+
+    type Location = <StreamTokens<I, Buffer> as Tokens>::Location;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.0.next()
+    }
+
+    fn location(&self) -> Self::Location {
+        self.0.location()
+    }
+
+    fn set_location(&mut self, location: Self::Location) {
+        self.0.set_location(location)
+    }
+
+    fn is_at_location(&self, location: &Self::Location) -> bool {
+        self.0.is_at_location(location)
+    }
+
+    fn parse<Out, Buf>(&mut self) -> Result<Out, <Out as core::str::FromStr>::Err>
+    where
+        Out: core::str::FromStr,
+        Buf: FromIterator<Self::Item> + core::ops::Deref<Target = str>,
+    {
+        // Fill rest of buffer with the wrapped stream before parsing everything.
+        let from = self.location();
+        while let Some(_) = self.0.next() {}
+        // Parse everything.
+        let res = self.0.buffer.elements[from.cursor - self.0.buffer.oldest_elem_cursor..].parse();
+        // Reset location on error.
+        if res.is_err() {
+            self.set_location(from)
+        };
+        res
+    }
+    fn parse_slice<Out, Buf>(
+        &mut self,
+        from: Self::Location,
+        to: Self::Location,
+    ) -> Result<Out, <Out as core::str::FromStr>::Err>
+    where
+        Out: core::str::FromStr,
+        Buf: FromIterator<Self::Item> + core::ops::Deref<Target = str>,
+    {
+        self.0.buffer.elements[from.cursor - self.0.buffer.oldest_elem_cursor
+            ..to.cursor - self.0.buffer.oldest_elem_cursor]
+            .parse()
+    }
+    fn parse_take<Out, Buf>(&mut self, n: usize) -> Result<Out, <Out as core::str::FromStr>::Err>
+    where
+        Out: core::str::FromStr,
+        Buf: FromIterator<Self::Item> + core::ops::Deref<Target = str>,
+    {
+        // Consume the n tokens.
+        let from = self.location();
+        self.take(n).consume();
+
+        let res = self.0.buffer.elements[from.cursor - self.0.buffer.oldest_elem_cursor
+            ..self.0.cursor - self.0.buffer.oldest_elem_cursor]
+            .parse();
+
+        // Reset location on error.
+        if res.is_err() {
+            self.set_location(from);
+        }
+        res
+    }
+    fn parse_take_while<Out, Buf, F>(
+        &mut self,
+        take_while: F,
+    ) -> Result<Out, <Out as core::str::FromStr>::Err>
+    where
+        Out: core::str::FromStr,
+        Buf: FromIterator<Self::Item> + core::ops::Deref<Target = str>,
+        F: FnMut(&Self::Item) -> bool,
+    {
+        // Consume all of the tokens matching the function.
+        let from = self.location();
+        self.take_while(take_while).consume();
+
+        let res = self.0.buffer.elements[from.cursor - self.0.buffer.oldest_elem_cursor
+            ..self.0.cursor - self.0.buffer.oldest_elem_cursor]
+            .parse();
+
+        // Reset location on error.
+        if res.is_err() {
+            self.set_location(from);
+        }
+        res
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn str_tokens_parse_optimizations_work() {
+        // This buffer will panic if it's used.
+        struct BadBuffer;
+        impl core::iter::FromIterator<char> for BadBuffer {
+            fn from_iter<T: IntoIterator<Item = char>>(_: T) -> Self {
+                panic!("FromIterator impl shouldn't be used")
+            }
+        }
+        impl core::ops::Deref for BadBuffer {
+            type Target = str;
+            fn deref(&self) -> &Self::Target {
+                panic!("Deref impl shouldn't be used")
+            }
+        }
+
+        // 0. parse()
+
+        let mut tokens = StrStreamTokens::new("123".chars());
+
+        assert_eq!(tokens.parse::<_, BadBuffer>(), Ok(123));
+
+        // 1. slice(..).parse()
+
+        let mut tokens = StrStreamTokens::new("123abc".chars());
+
+        // Find locations to the number:
+        let from = tokens.location();
+        tokens.take_while(|t| t.is_numeric()).consume();
+        let to = tokens.location();
+
+        let n = tokens
+            .slice(from, to)
+            .parse::<u32, BadBuffer>()
+            .expect("parse worked (1)");
+
+        assert_eq!(n, 123);
+        assert_eq!(tokens.collect::<String>(), "abc");
+
+        // 2. take(..).parse()
+
+        let mut tokens = StrStreamTokens::new("123abc".chars());
+
+        let n = tokens
+            .take(3)
+            .parse::<u32, BadBuffer>()
+            .expect("parse worked (2)");
+
+        assert_eq!(n, 123);
+        assert_eq!(tokens.collect::<String>(), "abc");
+
+        // 3. take_while(..).parse()
+
+        let mut tokens = StrStreamTokens::new("123abc".chars());
+
+        let n = tokens
+            .take_while(|t| t.is_numeric())
+            .parse::<u32, BadBuffer>()
+            .expect("parse worked (3)");
+
+        assert_eq!(n, 123);
+        assert_eq!(tokens.collect::<String>(), "abc");
+
+        // 4. take(..).take_while(..).take(..).parse()
+
+        let mut tokens = StrStreamTokens::new("123ab+=".chars());
+
+        let n = tokens
+            .take(6)
+            .take(5)
+            .take_while(|t| t.is_alphanumeric())
+            .take_while(|t| t.is_numeric())
+            .take(2)
+            .parse::<u32, BadBuffer>()
+            .expect("parse worked (4)");
+
+        assert_eq!(n, 12);
+        assert_eq!(tokens.collect::<String>(), "3ab+=");
+    }
+}
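A minimal usage sketch, not part of the patch itself: it assumes the `StrStreamTokens` API introduced above together with yap's existing `token` and `take_while` combinators, and shows the path `parse` is intended to take through the internal `String` buffer. The input string and names here are illustrative only.

```rust
use yap_streaming::{StrStreamTokens, Tokens};

fn main() {
    // Any `Iterator<Item = char>` that can't be cloned works here; a plain
    // `chars()` iterator stands in for a real stream.
    let mut tokens = StrStreamTokens::new("123+456".chars());

    // `parse` on the `take_while(..)` wrapper forwards to `parse_take_while`,
    // which (per this patch) re-parses straight out of the internal `String`
    // buffer instead of collecting the matched tokens into a fresh buffer.
    let lhs = tokens
        .take_while(|c| c.is_numeric())
        .parse::<u32, String>()
        .expect("number expected");
    assert!(tokens.token('+'));
    let rhs = tokens
        .take_while(|c| c.is_numeric())
        .parse::<u32, String>()
        .expect("number expected");

    assert_eq!(lhs + rhs, 579);
}
```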