Try out winnow

bevyengine · Apr 28, 2024 · 3771d6c · 3771d6c
1 parent 33e57e4
commit 3771d6c
Show file tree

Hide file tree

Showing 3 changed files with 171 additions and 90 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -29,6 +29,7 @@ rustc-hash = "1.1.0"
 unicode-ident = "1"
 once_cell = "1.17.0"
 indexmap = "2"
+winnow = "0.6.7"
 
 [dev-dependencies]
 wgpu = { version = "0.19.0", features = ["naga-ir"] }

diff --git a/src/compose/comment_strip_iter.rs b/src/compose/comment_strip_iter.rs
@@ -1,90 +1,178 @@
-use std::{borrow::Cow, str::Lines};
+use std::{borrow::Cow, ops::Range};
+
+use winnow::{
+    ascii::till_line_ending,
+    combinator::{cut_err, opt},
+    error::StrContext,
+    token::any,
+    Located, PResult, Parser,
+};
+
+struct SourceCode {
+    /** Sorted pieces of the source code without any gaps */
+    parts: Vec<SourceCodePart>,
+}
+
+enum SourceCodePart {
+    Text(Range<usize>),
+    SingleLineComment(SingleLineComment),
+    MultiLineComment(MultiLineComment),
+}
+
+impl SourceCodePart {
+    fn span(&self) -> Range<usize> {
+        match self {
+            SourceCodePart::Text(span) => span.clone(),
+            SourceCodePart::SingleLineComment(comment) => comment.span.clone(),
+            SourceCodePart::MultiLineComment(comment) => comment.span.clone(),
+        }
+    }
+}
 
-use regex::Regex;
+pub struct SingleLineComment {
+    pub span: Range<usize>,
+}
+pub struct MultiLineComment {
+    pub span: Range<usize>,
+}
 
-static RE_COMMENT: once_cell::sync::Lazy<Regex> =
-    once_cell::sync::Lazy::new(|| Regex::new(r"(//|/\*|\*/)").unwrap());
+fn parse_source(input: &mut Located<&str>) -> PResult<SourceCode> {
+    let mut parts = Vec::new();
+    loop {
+        if input.is_empty() {
+            break;
+        }
+        if let Some(part) = opt(single_line_comment).parse_next(input)? {
+            parts.push(SourceCodePart::SingleLineComment(part));
+        } else if let Some(part) = opt(multi_line_comment).parse_next(input)? {
+            parts.push(SourceCodePart::MultiLineComment(part));
+        } else {
+            let text_span = any.span().parse_next(input)?;
+            if let Some(SourceCodePart::Text(last_span)) = parts.last_mut() {
+                last_span.end = text_span.end;
+            } else {
+                parts.push(SourceCodePart::Text(text_span));
+            }
+        }
+    }
+    Ok(SourceCode { parts })
+}
+
+fn single_line_comment(input: &mut Located<&str>) -> PResult<SingleLineComment> {
+    let start_span = "//".span().parse_next(input)?;
+    let text_span = till_line_ending.span().parse_next(input)?;
+    Ok(SingleLineComment {
+        span: start_span.start..text_span.end,
+    })
+}
+fn multi_line_comment(input: &mut Located<&str>) -> PResult<MultiLineComment> {
+    let start_span = "/*".span().parse_next(input)?;
+    loop {
+        if let Some(end_span) = opt("*/".span()).parse_next(input)? {
+            return Ok(MultiLineComment {
+                span: start_span.start..end_span.end,
+            });
+        } else if opt(multi_line_comment).parse_next(input)?.is_some() {
+            // Nested comment: fully consumed by the recursive call, keep scanning after it
+        } else {
+            // Skip one character; at end of input `cut_err` makes this a hard (non-backtracking) error
+            let _ = cut_err(any)
+                .context(StrContext::Label("multiline comment"))
+                .parse_next(input)?;
+        }
+    }
+}
 
 pub struct CommentReplaceIter<'a> {
-    lines: &'a mut Lines<'a>,
-    block_depth: usize,
+    text: &'a str,
+    text_index: usize,
+    parsed: SourceCode,
+    parsed_index: usize,
+}
+
+fn clamp_range(range: Range<usize>, min: usize, max: usize) -> Range<usize> {
+    range.start.clamp(min, max)..range.end.clamp(min, max)
 }
 
 impl<'a> Iterator for CommentReplaceIter<'a> {
-    type Item = Cow<'a, str>;
+    type Item = (Cow<'a, str>, &'a str);
 
     fn next(&mut self) -> Option<Self::Item> {
-        let line_in = self.lines.next()?;
-        let mut markers = RE_COMMENT
-            .captures_iter(line_in)
-            .map(|cap| cap.get(0).unwrap())
-            .peekable();
-
-        // fast path
-        if self.block_depth == 0 && markers.peek().is_none() {
-            return Some(Cow::Borrowed(line_in));
+        if self.text_index >= self.text.len() {
+            return None;
         }
 
-        let mut output = String::new();
-        let mut section_start = 0;
-
-        loop {
-            let mut next_marker = markers.next();
-            let mut section_end = next_marker.map(|m| m.start()).unwrap_or(line_in.len());
-
-            // skip partial tokens
-            while next_marker.is_some() && section_start > section_end {
-                next_marker = markers.next();
-                section_end = next_marker.map(|m| m.start()).unwrap_or(line_in.len());
+        let line_start = self.text_index;
+        let line_end = self.text[line_start..]
+            .find('\n') // TODO: Handle \r\n
+            .map(|i| line_start + i + 1)
+            .unwrap_or_else(|| self.text.len());
+        self.text_index = line_end;
+
+        let mut parts = Vec::new();
+        for (i, parsed_part) in self.parsed.parts.iter().enumerate().skip(self.parsed_index) {
+            let span = parsed_part.span();
+            if span.start >= line_end {
+                break;
+            }
+            if span.end <= line_start {
+                self.parsed_index = i + 1;
+                continue;
             }
+            parts.push((parsed_part, clamp_range(span, line_start, line_end)));
+        }
 
-            if self.block_depth == 0 {
-                output.push_str(&line_in[section_start..section_end]);
-            } else {
-                output.extend(std::iter::repeat(' ').take(section_end - section_start));
+        assert!(!parts.is_empty());
+
+        // Fast path: a single part covers the whole line, so no stitching is needed
+        if parts.len() == 1 {
+            match parts.into_iter().next().unwrap() {
+                (SourceCodePart::Text(_), span) => {
+                    return Some((
+                        Cow::Borrowed(&self.text[span]),
+                        &self.text[line_start..line_end],
+                    ));
+                }
+                (
+                    SourceCodePart::SingleLineComment(_) | SourceCodePart::MultiLineComment(_),
+                    span,
+                ) => {
+                    let spaces = " ".repeat(span.len());
+                    return Some((Cow::Owned(spaces), &self.text[line_start..line_end]));
+                }
             }
+        }
 
-            match next_marker {
-                None => return Some(Cow::Owned(output)),
-                Some(marker) => {
-                    match marker.as_str() {
-                        "//" => {
-                            // the specs (https://www.w3.org/TR/WGSL/#comment, https://registry.khronos.org/OpenGL/specs/gl/GLSLangSpec.4.60.pdf @ 3.4) state that
-                            // whichever comment-type starts first should cancel parsing of the other type
-                            if self.block_depth == 0 {
-                                output.extend(
-                                    std::iter::repeat(' ').take(line_in.len() - marker.start()),
-                                );
-                                return Some(Cow::Owned(output));
-                            }
-                        }
-                        "/*" => {
-                            self.block_depth += 1;
-                        }
-                        "*/" => {
-                            self.block_depth = self.block_depth.saturating_sub(1);
-                        }
-                        _ => unreachable!(),
-                    }
-                    output.extend(std::iter::repeat(' ').take(marker.as_str().len()));
-                    section_start = marker.end();
+        let mut output = String::new();
+        let mut last_end = line_start;
+        for (part, span) in parts {
+            output.push_str(&self.text[last_end..span.start]);
+            last_end = span.end;
+            match part {
+                SourceCodePart::Text(_) => {
+                    output.push_str(&self.text[span]);
+                }
+                SourceCodePart::SingleLineComment(_) | SourceCodePart::MultiLineComment(_) => {
+                    output.extend(std::iter::repeat(' ').take(span.len()));
                 }
             }
         }
-    }
-}
 
-pub trait CommentReplaceExt<'a> {
-    /// replace WGSL and GLSL comments with whitespace characters
-    fn replace_comments(&'a mut self) -> CommentReplaceIter;
+        assert_eq!(last_end, line_end);
+        Some((Cow::Owned(output), &self.text[line_start..line_end]))
+    }
 }
 
-impl<'a> CommentReplaceExt<'a> for Lines<'a> {
-    fn replace_comments(&'a mut self) -> CommentReplaceIter {
-        CommentReplaceIter {
-            lines: self,
-            block_depth: 0,
-        }
+/// Gives you an iterator that replaces comments in the input text with spaces.
+/// It yields `(blanked, original)` line pairs of equal byte length; `original` keeps its newline.
+/// Panics if the input contains a `/*` block comment that is never closed.
+pub fn replace_comments(input: &str) -> CommentReplaceIter<'_> {
+    let parsed = parse_source(&mut Located::new(input)).expect("unterminated block comment");
+    CommentReplaceIter {
+        text: input,
+        text_index: 0,
+        parsed,
+        parsed_index: 0,
     }
 }
 
@@ -108,14 +196,10 @@ not commented
 ";
 
     assert_eq!(
-        INPUT
-            .lines()
-            .replace_comments()
-            .zip(INPUT.lines())
-            .find(|(line, original)| {
-                (line != "not commented" && !line.chars().all(|c| c == ' '))
-                    || line.len() != original.len()
-            }),
+        replace_comments(INPUT).find(|(line, original)| {
+            (line.trim_end() != "not commented" && !line.chars().all(|c| c == ' ' || c == '\n'))
+                || line.len() != original.len()
+        }),
         None
     );
 
@@ -139,8 +223,7 @@ not commented
     ];
 
     for &(input, expected) in PARTIAL_TESTS.iter() {
-        let mut nasty_processed = input.lines();
-        let nasty_processed = nasty_processed.replace_comments().next().unwrap();
+        let nasty_processed = replace_comments(input).next().unwrap().0;
         assert_eq!(&nasty_processed, expected);
     }
 }
diff --git a/src/compose/preprocess.rs b/src/compose/preprocess.rs
@@ -4,7 +4,7 @@ use indexmap::IndexMap;
 use regex::Regex;
 
 use super::{
-    comment_strip_iter::CommentReplaceExt,
+    comment_strip_iter::replace_comments,
     parse_imports::{parse_imports, substitute_identifiers},
     ComposerErrorInner, ImportDefWithOffset, ShaderDefValue,
 };
@@ -245,8 +245,7 @@ impl Preprocessor {
         let len = shader_str.len();
 
         // this code broadly stolen from bevy_render::ShaderProcessor
-        let mut lines = shader_str.lines();
-        let mut lines = lines.replace_comments().zip(shader_str.lines()).peekable();
+        let mut lines = replace_comments(shader_str).peekable();
 
         while let Some((mut line, original_line)) = lines.next() {
             let mut output = false;
@@ -271,8 +270,9 @@ impl Preprocessor {
 
                     loop {
                         // output spaces for removed lines to keep spans consistent (errors report against substituted_source, which is not preprocessed)
-                        final_string.extend(std::iter::repeat(" ").take(line.len()));
-                        offset += line.len() + 1;
+                        final_string
+                            .extend(std::iter::repeat(" ").take(line.len().saturating_sub(1)));
+                        offset += line.len();
 
                         // PERF: Ideally we don't do multiple `match_indices` passes over `line`
                         // in addition to the final pass for the import parse
@@ -283,7 +283,6 @@ impl Preprocessor {
                         //     let import_lines = &shader_str[initial_offset..offset]
                         // but we need the comments removed, and the iterator approach doesn't make that easy
                         import_lines.push_str(&line);
-                        import_lines.push('\n');
 
                         if open_count == 0 || lines.peek().is_none() {
                             break;
@@ -356,15 +355,15 @@ impl Preprocessor {
                     final_string.push_str(&item_replaced_line);
                     let diff = line.len().saturating_sub(item_replaced_line.len());
                     final_string.extend(std::iter::repeat(" ").take(diff));
-                    offset += original_line.len() + 1;
+                    offset += original_line.len();
                     output = true;
                 }
             }
 
             if !output {
                 // output spaces for removed lines to keep spans consistent (errors report against substituted_source, which is not preprocessed)
-                final_string.extend(std::iter::repeat(" ").take(line.len()));
-                offset += line.len() + 1;
+                final_string.extend(std::iter::repeat(" ").take(line.len().saturating_sub(1)));
+                offset += line.len();
             }
             final_string.push('\n');
         }
@@ -398,10 +397,9 @@ impl Preprocessor {
         let mut defines = HashMap::default();
         let mut effective_defs = HashSet::default();
 
-        let mut lines = shader_str.lines();
-        let mut lines = lines.replace_comments().peekable();
+        let mut lines = replace_comments(shader_str).peekable();
 
-        while let Some(mut line) = lines.next() {
+        while let Some((mut line, _)) = lines.next() {
             let (is_scope, def) = self.check_scope(&HashMap::default(), &line, None, offset)?;
 
             if is_scope {
@@ -423,7 +421,6 @@ impl Preprocessor {
                     //     let import_lines = &shader_str[initial_offset..offset]
                     // but we need the comments removed, and the iterator approach doesn't make that easy
                     import_lines.push_str(&line);
-                    import_lines.push('\n');
 
                     if open_count == 0 || lines.peek().is_none() {
                         break;
@@ -432,7 +429,7 @@ impl Preprocessor {
                     // output spaces for removed lines to keep spans consistent (errors report against substituted_source, which is not preprocessed)
                     offset += line.len() + 1;
 
-                    line = lines.next().unwrap();
+                    line = lines.next().unwrap().0;
                 }
 
                 parse_imports(import_lines.as_str(), &mut declared_imports).map_err(