feat: Allow specifying token match threshold for delimited recovery
This is required to resolve syntax ambiguities, since by default we can't
always safely attempt recovery when 2 or more tokens of lookahead are needed.
Xanewok committed Mar 4, 2024
1 parent 8f3e841 commit 71f3d1b
Showing 21 changed files with 238 additions and 202 deletions.
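In short, the boolean `disallow_unmatched_body` grammar option becomes a numeric `tokens_matched_acceptance_threshold`, and the runtime's `RecoverFromNoMatch` flag becomes a `TokenAcceptanceThreshold(u8)`. A standalone Rust sketch of how the old settings map onto the new threshold (illustrative only; the named constants below are not part of the crate):

/// How many (non-skipped) tokens must have been matched before delimited
/// error recovery is accepted.
#[derive(Clone, Copy, Debug, Default)]
pub struct TokenAcceptanceThreshold(pub u8);

impl TokenAcceptanceThreshold {
    /// Previously `disallow_unmatched_body = false` / `RecoverFromNoMatch::Yes`:
    /// recover even if nothing in the delimited body matched (the default of 0).
    pub const ANY: Self = Self(0);
    /// Previously `disallow_unmatched_body = true` / `RecoverFromNoMatch::No`:
    /// require at least a partial match before recovering.
    pub const PARTIAL_MATCH: Self = Self(1);
    /// Newly possible: require N matched tokens, where N is the lookahead
    /// needed to disambiguate the syntax (e.g. 2 for `NamedArguments` below).
    pub const TWO_TOKENS: Self = Self(2);
}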
6 changes: 3 additions & 3 deletions crates/codegen/grammar/src/constructor.rs
@@ -10,7 +10,7 @@ use codegen_language_definition::model::{self, FieldsErrorRecovery, Identifier,
use indexmap::IndexMap;

use crate::{
-DelimitedRecoveryOpts, Grammar, GrammarElement, KeywordScannerDefinition,
+DelimitedRecoveryTokenThreshold, Grammar, GrammarElement, KeywordScannerDefinition,
KeywordScannerDefinitionNode, KeywordScannerDefinitionVersionedNode, Labeled, ParserDefinition,
ParserDefinitionNode, PrecedenceOperatorModel, PrecedenceParserDefinition,
PrecedenceParserDefinitionNode, ScannerDefinition, ScannerDefinitionNode,
@@ -602,8 +602,8 @@ fn resolve_sequence_like(
let open = delims.next().unwrap();
let close = delims.next().unwrap();

-let opts = DelimitedRecoveryOpts::from(delimiters);
-ParserDefinitionNode::DelimitedBy(open, Box::new(delimited_body), close, opts)
+let threshold = DelimitedRecoveryTokenThreshold::from(delimiters);
+ParserDefinitionNode::DelimitedBy(open, Box::new(delimited_body), close, threshold)
};
// Replace with a new delimited node
fields.insert(
28 changes: 14 additions & 14 deletions crates/codegen/grammar/src/parser_definition.rs
@@ -55,22 +55,22 @@ impl Visitable for TriviaParserDefinitionRef {
}
}

+/// How many tokens have to be matched to trigger the error recovery.
+/// For ambiguous syntaxes this needs to be set to at least N, where N
+/// is the token lookahead required to disambiguate the syntax.
+///
+// By default, we assume no lookahead is required to recover from
+// unrecognized body between delimiters.
#[derive(Clone, Debug, Default)]
-pub struct DelimitedRecoveryOpts {
-/// Whether completely unmatched body between the delimiters should
-/// prevent the the error recovery from being applied.
-/// This is generally safe but sometimes needs to be disabled if the
-/// recovery would lead to a misparse in case of ambiguous input.
-pub disallow_unmatched_body: bool,
-}
+pub struct DelimitedRecoveryTokenThreshold(pub u8);

-impl From<model::FieldDelimiters> for DelimitedRecoveryOpts {
+impl From<model::FieldDelimiters> for DelimitedRecoveryTokenThreshold {
fn from(delimiters: model::FieldDelimiters) -> Self {
-Self {
-disallow_unmatched_body: delimiters
-.disallow_unmatched_body
-.unwrap_or(DelimitedRecoveryOpts::default().disallow_unmatched_body),
-}
+Self(
+delimiters
+.tokens_matched_acceptance_threshold
+.unwrap_or(DelimitedRecoveryTokenThreshold::default().0),
+)
}
}

@@ -91,7 +91,7 @@ pub enum ParserDefinitionNode {
Labeled<Box<Self>>,
Box<Self>,
Labeled<Box<Self>>,
-DelimitedRecoveryOpts,
+DelimitedRecoveryTokenThreshold,
),
SeparatedBy(Labeled<Box<Self>>, Labeled<Box<Self>>),
TerminatedBy(Box<Self>, Labeled<Box<Self>>),
Changes in another file:
@@ -140,6 +140,15 @@ impl ParseInputTokens for usize {
}
}

+impl ParseInputTokens for u8 {
+fn parse_value(input: ParseStream<'_>, _: &mut ErrorsCollection) -> Result<Self> {
+let literal = ParseHelpers::syn::<syn::LitInt>(input)?;
+let value = literal.base10_parse::<u8>()?;
+
+Ok(value)
+}
+}
+
impl<T: ParseInputTokens> ParseInputTokens for Vec<T> {
fn parse_value(input: ParseStream<'_>, errors: &mut ErrorsCollection) -> Result<Self> {
Ok(ParseHelpers::sequence(input, errors))
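For readers unfamiliar with `syn`, the new impl above boils down to parsing an integer literal and narrowing it with `base10_parse`. A standalone sketch using plain `syn` (an assumed simplification; the crate's `ParseHelpers` and `ErrorsCollection` machinery is omitted):

// Parse a `u8` out of a token stream via an integer literal such as `2` or `2u8`.
fn parse_u8(tokens: proc_macro2::TokenStream) -> syn::Result<u8> {
    let literal: syn::LitInt = syn::parse2(tokens)?;
    literal.base10_parse::<u8>()
}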
Changes in another file:
@@ -119,6 +119,16 @@ impl WriteOutputTokens for usize {
}
}

+impl WriteOutputTokens for u8 {
+fn write_output_tokens(&self) -> TokenStream {
+let value = Literal::u8_suffixed(*self);
+
+quote! {
+#value.into()
+}
+}
+}
+
impl<T: WriteOutputTokens> WriteOutputTokens for Vec<T> {
fn write_output_tokens(&self) -> TokenStream {
let items = self.iter().map(T::write_output_tokens);
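Symmetrically, writing the value back out is a suffixed `proc_macro2` literal interpolated with `quote!`. A minimal standalone sketch (assumed; the crate's `WriteOutputTokens` trait is not reproduced here):

// Emit `<value>u8.into()`, mirroring the output generated by the new impl above.
fn u8_to_tokens(value: u8) -> proc_macro2::TokenStream {
    let literal = proc_macro2::Literal::u8_suffixed(value);
    quote::quote! { #literal.into() }
}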
Changes in another file:
@@ -15,12 +15,11 @@ pub struct FieldsErrorRecovery {
pub struct FieldDelimiters {
pub open: Identifier,
pub close: Identifier,
-/// Whether completely unmatched body between the delimiters should
-/// prevent the the error recovery from being applied.
-///
-/// This is generally safe but sometimes needs to be disabled if the
-/// recovery would lead to a misparse in case of ambiguous input.
-pub disallow_unmatched_body: Option<bool>,
+/// How many tokens have to be matched to trigger the error recovery.
+/// For ambiguous syntaxes this needs to be set to at least N, where N
+/// is the token lookahead required to disambiguate the syntax.
+pub tokens_matched_acceptance_threshold: Option<u8>,
}

#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
Changes in another file:
@@ -159,7 +159,7 @@ fn get_spanned_type(input: Type) -> Type {
}

// External types should also be wrapped in 'Spanned<T>':
"bool" | "char" | "PathBuf" | "String" | "Version" => {
"bool" | "u8" | "char" | "PathBuf" | "String" | "Version" => {
parse_quote! {
crate::internals::Spanned<#input>
}
13 changes: 5 additions & 8 deletions crates/codegen/parser/generator/src/parser_definition.rs
@@ -191,7 +191,7 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
quote! { self.#function_name(input) }
}

-Self::DelimitedBy(open, body, close, opts) => {
+Self::DelimitedBy(open, body, close, threshold) => {
let open_label = format_ident!("{}", open.label.to_pascal_case());
let close_label = format_ident!("{}", close.label.to_pascal_case());
let [open_delim, close_delim] = match (open.as_ref(), close.as_ref()) {
@@ -201,11 +201,7 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
) => [open, close].map(|scanner| format_ident!("{}", scanner.name())),
_ => unreachable!("Only tokens are permitted as delimiters"),
};
-let recover = if opts.disallow_unmatched_body {
-quote! { RecoverFromNoMatch::No }
-} else {
-quote! { RecoverFromNoMatch::Yes }
-};
+let threshold = threshold.0;

let parser = body.to_parser_code(context_name, is_trivia);
let body_parser = body.applicable_version_quality_ranges().wrap_code(
@@ -214,7 +210,7 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
.recover_until_with_nested_delims::<_, #lex_ctx>(input,
self,
TokenKind::#close_delim,
-#recover,
+TokenAcceptanceThreshold(#threshold),
)
)?;
},
@@ -280,7 +276,7 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
.recover_until_with_nested_delims::<_, #lex_ctx>(input,
self,
TokenKind::#terminator,
-RecoverFromNoMatch::No,
+// Requires at least a partial match not to risk misparsing
+TokenAcceptanceThreshold(1u8),
)
)?;
},
2 changes: 1 addition & 1 deletion crates/codegen/parser/runtime/src/parser_support/mod.rs
@@ -26,7 +26,7 @@ pub(crate) use parser_result::ParserResult;
#[allow(unused_imports)]
pub(crate) use precedence_helper::PrecedenceHelper;
#[allow(unused_imports)]
-pub(crate) use recovery::RecoverFromNoMatch;
+pub(crate) use recovery::TokenAcceptanceThreshold;
#[allow(unused_imports)]
pub(crate) use repetition_helper::{OneOrMoreHelper, ZeroOrMoreHelper};
#[allow(unused_imports)]
Changes in another file:
@@ -1,6 +1,6 @@
use std::ops::ControlFlow;

-use crate::cst::{self, LabeledNode};
+use crate::cst::{self, LabeledNode, Node};
use crate::kinds::{NodeLabel, RuleKind, TokenKind};
use crate::text_index::TextIndex;

@@ -201,6 +201,33 @@ impl IncompleteMatch {
expected_tokens,
}
}

+/// Whether this prefix-matched at least `n` (non-skipped) tokens.
+pub fn matches_at_least_n_tokens(&self, n: u8) -> bool {
+let result = self
+.nodes
+.iter()
+.flat_map(|node| node.cursor_with_offset(TextIndex::ZERO))
+.try_fold(0u8, |mut acc, node| {
+match node {
+Node::Token(tok) if tok.kind != TokenKind::SKIPPED => {
+acc += 1;
+}
+_ => {}
+}
+
+// Short-circuit not to walk the whole tree if we've already matched enough
+if acc >= n {
+ControlFlow::Break(acc)
+} else {
+ControlFlow::Continue(acc)
+}
+});
+
+match result {
+ControlFlow::Continue(value) | ControlFlow::Break(value) => value >= n,
+}
+}
}

#[derive(PartialEq, Eq, Clone, Debug)]
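The new `matches_at_least_n_tokens` helper counts non-skipped tokens with a short-circuiting `try_fold` over `ControlFlow`. The same pattern in isolation, over a plain slice (an assumed standalone example, not code from this commit):

use std::ops::ControlFlow;

// Count items satisfying a predicate, but stop iterating once `n` have been seen.
fn has_at_least_n_even(numbers: &[u32], n: u8) -> bool {
    let result = numbers.iter().try_fold(0u8, |mut count, value| {
        if *value % 2 == 0 {
            count += 1;
        }
        // `Break` short-circuits the walk; `Continue` carries the tally onward.
        if count >= n {
            ControlFlow::Break(count)
        } else {
            ControlFlow::Continue(count)
        }
    });

    // Either variant holds the final tally.
    match result {
        ControlFlow::Continue(count) | ControlFlow::Break(count) => count >= n,
    }
}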
33 changes: 15 additions & 18 deletions crates/codegen/parser/runtime/src/parser_support/recovery.rs
@@ -7,18 +7,11 @@ use crate::parser_support::parser_result::SkippedUntil;
use crate::parser_support::ParserResult;
use crate::text_index::{TextRange, TextRangeExtensions as _};

-/// An explicit parameter for the [`ParserResult::recover_until_with_nested_delims`] method.
+/// How many tokens have to be matched to trigger the error recovery.
+/// For ambiguous syntaxes this needs to be set to at least N, where N
+/// is the token lookahead required to disambiguate the syntax.
#[derive(Clone, Copy)]
-pub(crate) enum RecoverFromNoMatch {
-Yes,
-No,
-}
-
-impl RecoverFromNoMatch {
-pub fn as_bool(self) -> bool {
-matches!(self, RecoverFromNoMatch::Yes)
-}
-}
+pub(crate) struct TokenAcceptanceThreshold(pub(crate) u8);

fn opt_parse(
input: &mut ParserContext<'_>,
@@ -46,7 +39,7 @@ impl ParserResult {
input: &mut ParserContext<'_>,
lexer: &L,
expected: TokenKind,
-recover_from_no_match: RecoverFromNoMatch,
+acceptance_threshold: TokenAcceptanceThreshold,
) -> ParserResult {
enum ParseResultKind {
Match,
@@ -57,11 +50,15 @@ impl ParserResult {
let before_recovery = input.position();

let (mut nodes, mut expected_tokens, result_kind) = match self {
-ParserResult::IncompleteMatch(result) => (
-result.nodes,
-result.expected_tokens,
-ParseResultKind::Incomplete,
-),
+ParserResult::IncompleteMatch(result)
+if result.matches_at_least_n_tokens(acceptance_threshold.0) =>
+{
+(
+result.nodes,
+result.expected_tokens,
+ParseResultKind::Incomplete,
+)
+}
ParserResult::Match(result)
if lexer
.peek_token_with_trivia::<LexCtx>(input)
@@ -70,7 +67,7 @@ impl ParserResult {
{
(result.nodes, result.expected_tokens, ParseResultKind::Match)
}
-ParserResult::NoMatch(result) if recover_from_no_match.as_bool() => {
+ParserResult::NoMatch(result) if acceptance_threshold.0 == 0 => {
(vec![], result.expected_tokens, ParseResultKind::NoMatch)
}
// No need to recover, so just return as-is.
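Taken together, the recovery gating in `recover_until_with_nested_delims` now behaves roughly as follows (a simplified sketch with assumed names, not the crate's actual types):

// Simplified model of the threshold check above.
enum Outcome {
    /// The body matched partially; `tokens_matched` counts non-skipped tokens.
    Incomplete { tokens_matched: u8 },
    /// Nothing in the delimited body matched.
    NoMatch,
}

/// Whether to attempt recovery by skipping ahead to the closing delimiter.
fn attempt_recovery(outcome: &Outcome, threshold: u8) -> bool {
    match outcome {
        // An incomplete match qualifies once at least `threshold` tokens matched.
        Outcome::Incomplete { tokens_matched } => *tokens_matched >= threshold,
        // A completely unmatched body is recovered only with a threshold of 0 (the default).
        Outcome::NoMatch => threshold == 0,
    }
}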
Changes in another file:
@@ -22,7 +22,7 @@ use crate::napi_interface::parse_output::ParseOutput as NAPIParseOutput;
use crate::parse_output::ParseOutput;
use crate::parser_support::{
ChoiceHelper, OneOrMoreHelper, OptionalHelper, ParserContext, ParserFunction, ParserResult,
-PrecedenceHelper, RecoverFromNoMatch, SeparatedHelper, SequenceHelper, ZeroOrMoreHelper,
+PrecedenceHelper, SeparatedHelper, SequenceHelper, TokenAcceptanceThreshold, ZeroOrMoreHelper,
};

#[derive(Debug)]
8 changes: 4 additions & 4 deletions crates/solidity/inputs/language/src/definition.rs
@@ -3349,10 +3349,10 @@ codegen_language_macros::compile!(Language(
open = open_brace,
close = close_brace,
// NOTE: Despite `NamedArguments` requiring at least one element,
-// we need to disable attempting to recover from empty elements,
-// because this postfix is ambiguous with `try <EXPR> {} catch {}`
-// and could lead to incorrect parsing if we recover past valid syntax.
-disallow_unmatched_body = true
+// we can only recover if we found at least two tokens (`ident:`)
+// in the body, as this may be otherwise ambiguous with
+// `try <EXPR> { func() } catch {}`.
+tokens_matched_acceptance_threshold = 2
)
),
fields = (