feat: Allow specifying token match threshold for delimited recovery
This is required to resolve syntax ambiguities, since by default we can't
always safely attempt recovery when 2 or more tokens of lookahead are needed.
Xanewok committed Mar 4, 2024
1 parent 8f3e841 commit 71f3d1b
Showing 21 changed files with 238 additions and 202 deletions.
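In short, the boolean `disallow_unmatched_body` grammar option becomes a numeric `tokens_matched_acceptance_threshold`, and the runtime's `RecoverFromNoMatch` flag becomes a `TokenAcceptanceThreshold(u8)`. A standalone Rust sketch of how the old settings map onto the new threshold (illustrative only; the named constants below are not part of the crate):

/// How many (non-skipped) tokens must have been matched before delimited
/// error recovery is accepted.
#[derive(Clone, Copy, Debug, Default)]
pub struct TokenAcceptanceThreshold(pub u8);

impl TokenAcceptanceThreshold {
    /// Previously `disallow_unmatched_body = false` / `RecoverFromNoMatch::Yes`:
    /// recover even if nothing in the delimited body matched (the default of 0).
    pub const ANY: Self = Self(0);
    /// Previously `disallow_unmatched_body = true` / `RecoverFromNoMatch::No`:
    /// require at least a partial match before recovering.
    pub const PARTIAL_MATCH: Self = Self(1);
    /// Newly possible: require N matched tokens, where N is the lookahead
    /// needed to disambiguate the syntax (e.g. 2 for `NamedArguments` below).
    pub const TWO_TOKENS: Self = Self(2);
}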
6 changes: 3 additions & 3 deletions crates/codegen/grammar/src/constructor.rs
@@ -10,7 +10,7 @@ use codegen_language_definition::model::{self, FieldsErrorRecovery, Identifier,
use indexmap::IndexMap;

use crate::{
-DelimitedRecoveryOpts, Grammar, GrammarElement, KeywordScannerDefinition,
+DelimitedRecoveryTokenThreshold, Grammar, GrammarElement, KeywordScannerDefinition,
KeywordScannerDefinitionNode, KeywordScannerDefinitionVersionedNode, Labeled, ParserDefinition,
ParserDefinitionNode, PrecedenceOperatorModel, PrecedenceParserDefinition,
PrecedenceParserDefinitionNode, ScannerDefinition, ScannerDefinitionNode,
@@ -602,8 +602,8 @@ fn resolve_sequence_like(
let open = delims.next().unwrap();
let close = delims.next().unwrap();

-let opts = DelimitedRecoveryOpts::from(delimiters);
-ParserDefinitionNode::DelimitedBy(open, Box::new(delimited_body), close, opts)
+let threshold = DelimitedRecoveryTokenThreshold::from(delimiters);
+ParserDefinitionNode::DelimitedBy(open, Box::new(delimited_body), close, threshold)
};
// Replace with a new delimited node
fields.insert(
28 changes: 14 additions & 14 deletions crates/codegen/grammar/src/parser_definition.rs
@@ -55,22 +55,22 @@ impl Visitable for TriviaParserDefinitionRef {
}
}

+/// How many tokens have to be matched to trigger the error recovery.
+/// For ambiguous syntaxes this needs to be set to at least N, where N
+/// is the token lookahead required to disambiguate the syntax.
+///
+// By default, we assume no lookahead is required to recover from
+// unrecognized body between delimiters.
#[derive(Clone, Debug, Default)]
-pub struct DelimitedRecoveryOpts {
-/// Whether completely unmatched body between the delimiters should
-/// prevent the the error recovery from being applied.
-/// This is generally safe but sometimes needs to be disabled if the
-/// recovery would lead to a misparse in case of ambiguous input.
-pub disallow_unmatched_body: bool,
-}
+pub struct DelimitedRecoveryTokenThreshold(pub u8);

-impl From<model::FieldDelimiters> for DelimitedRecoveryOpts {
+impl From<model::FieldDelimiters> for DelimitedRecoveryTokenThreshold {
fn from(delimiters: model::FieldDelimiters) -> Self {
-Self {
-disallow_unmatched_body: delimiters
-.disallow_unmatched_body
-.unwrap_or(DelimitedRecoveryOpts::default().disallow_unmatched_body),
-}
+Self(
+delimiters
+.tokens_matched_acceptance_threshold
+.unwrap_or(DelimitedRecoveryTokenThreshold::default().0),
+)
}
}

@@ -91,7 +91,7 @@ pub enum ParserDefinitionNode {
Labeled<Box<Self>>,
Box<Self>,
Labeled<Box<Self>>,
-DelimitedRecoveryOpts,
+DelimitedRecoveryTokenThreshold,
),
SeparatedBy(Labeled<Box<Self>>, Labeled<Box<Self>>),
TerminatedBy(Box<Self>, Labeled<Box<Self>>),
Changes in another file:
@@ -140,6 +140,15 @@ impl ParseInputTokens for usize {
}
}

+impl ParseInputTokens for u8 {
+fn parse_value(input: ParseStream<'_>, _: &mut ErrorsCollection) -> Result<Self> {
+let literal = ParseHelpers::syn::<syn::LitInt>(input)?;
+let value = literal.base10_parse::<u8>()?;
+
+Ok(value)
+}
+}
+
impl<T: ParseInputTokens> ParseInputTokens for Vec<T> {
fn parse_value(input: ParseStream<'_>, errors: &mut ErrorsCollection) -> Result<Self> {
Ok(ParseHelpers::sequence(input, errors))
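For readers unfamiliar with `syn`, the new impl above boils down to parsing an integer literal and narrowing it with `base10_parse`. A standalone sketch using plain `syn` (an assumed simplification; the crate's `ParseHelpers` and `ErrorsCollection` machinery is omitted):

// Parse a `u8` out of a token stream via an integer literal such as `2` or `2u8`.
fn parse_u8(tokens: proc_macro2::TokenStream) -> syn::Result<u8> {
    let literal: syn::LitInt = syn::parse2(tokens)?;
    literal.base10_parse::<u8>()
}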
Changes in another file:
@@ -119,6 +119,16 @@ impl WriteOutputTokens for usize {
}
}

+impl WriteOutputTokens for u8 {
+fn write_output_tokens(&self) -> TokenStream {
+let value = Literal::u8_suffixed(*self);
+
+quote! {
+#value.into()
+}
+}
+}
+
impl<T: WriteOutputTokens> WriteOutputTokens for Vec<T> {
fn write_output_tokens(&self) -> TokenStream {
let items = self.iter().map(T::write_output_tokens);
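Symmetrically, writing the value back out is a suffixed `proc_macro2` literal interpolated with `quote!`. A minimal standalone sketch (assumed; the crate's `WriteOutputTokens` trait is not reproduced here):

// Emit `<value>u8.into()`, mirroring the output generated by the new impl above.
fn u8_to_tokens(value: u8) -> proc_macro2::TokenStream {
    let literal = proc_macro2::Literal::u8_suffixed(value);
    quote::quote! { #literal.into() }
}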
Changes in another file:
@@ -15,12 +15,11 @@ pub struct FieldsErrorRecovery {
pub struct FieldDelimiters {
pub open: Identifier,
pub close: Identifier,
-/// Whether completely unmatched body between the delimiters should
-/// prevent the the error recovery from being applied.
-///
-/// This is generally safe but sometimes needs to be disabled if the
-/// recovery would lead to a misparse in case of ambiguous input.
-pub disallow_unmatched_body: Option<bool>,
+/// How many tokens have to be matched to trigger the error recovery.
+/// For ambiguous syntaxes this needs to be set to at least N, where N
+/// is the token lookahead required to disambiguate the syntax.
+pub tokens_matched_acceptance_threshold: Option<u8>,
}

#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
Changes in another file:
@@ -159,7 +159,7 @@ fn get_spanned_type(input: Type) -> Type {
}

// External types should also be wrapped in 'Spanned<T>':
"bool" | "char" | "PathBuf" | "String" | "Version" => {
"bool" | "u8" | "char" | "PathBuf" | "String" | "Version" => {
parse_quote! {
crate::internals::Spanned<#input>
}
13 changes: 5 additions & 8 deletions crates/codegen/parser/generator/src/parser_definition.rs
@@ -191,7 +191,7 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
quote! { self.#function_name(input) }
}

-Self::DelimitedBy(open, body, close, opts) => {
+Self::DelimitedBy(open, body, close, threshold) => {
let open_label = format_ident!("{}", open.label.to_pascal_case());
let close_label = format_ident!("{}", close.label.to_pascal_case());
let [open_delim, close_delim] = match (open.as_ref(), close.as_ref()) {
@@ -201,11 +201,7 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
) => [open, close].map(|scanner| format_ident!("{}", scanner.name())),
_ => unreachable!("Only tokens are permitted as delimiters"),
};
-let recover = if opts.disallow_unmatched_body {
-quote! { RecoverFromNoMatch::No }
-} else {
-quote! { RecoverFromNoMatch::Yes }
-};
+let threshold = threshold.0;

let parser = body.to_parser_code(context_name, is_trivia);
let body_parser = body.applicable_version_quality_ranges().wrap_code(
@@ -214,7 +210,7 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
.recover_until_with_nested_delims::<_, #lex_ctx>(input,
self,
TokenKind::#close_delim,
-#recover,
+TokenAcceptanceThreshold(#threshold),
)
)?;
},
@@ -280,7 +276,7 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
.recover_until_with_nested_delims::<_, #lex_ctx>(input,
self,
TokenKind::#terminator,
-RecoverFromNoMatch::No,
+// Requires at least a partial match not to risk misparsing
+TokenAcceptanceThreshold(1u8),
)
)?;
},
2 changes: 1 addition & 1 deletion crates/codegen/parser/runtime/src/parser_support/mod.rs
@@ -26,7 +26,7 @@ pub(crate) use parser_result::ParserResult;
#[allow(unused_imports)]
pub(crate) use precedence_helper::PrecedenceHelper;
#[allow(unused_imports)]
-pub(crate) use recovery::RecoverFromNoMatch;
+pub(crate) use recovery::TokenAcceptanceThreshold;
#[allow(unused_imports)]
pub(crate) use repetition_helper::{OneOrMoreHelper, ZeroOrMoreHelper};
#[allow(unused_imports)]
Changes in another file:
@@ -1,6 +1,6 @@
use std::ops::ControlFlow;

-use crate::cst::{self, LabeledNode};
+use crate::cst::{self, LabeledNode, Node};
use crate::kinds::{NodeLabel, RuleKind, TokenKind};
use crate::text_index::TextIndex;

@@ -201,6 +201,33 @@ impl IncompleteMatch {
expected_tokens,
}
}

+/// Whether this prefix-matched at least `n` (non-skipped) tokens.
+pub fn matches_at_least_n_tokens(&self, n: u8) -> bool {
+let result = self
+.nodes
+.iter()
+.flat_map(|node| node.cursor_with_offset(TextIndex::ZERO))
+.try_fold(0u8, |mut acc, node| {
+match node {
+Node::Token(tok) if tok.kind != TokenKind::SKIPPED => {
+acc += 1;
+}
+_ => {}
+}
+
+// Short-circuit not to walk the whole tree if we've already matched enough
+if acc >= n {
+ControlFlow::Break(acc)
+} else {
+ControlFlow::Continue(acc)
+}
+});
+
+match result {
+ControlFlow::Continue(value) | ControlFlow::Break(value) => value >= n,
+}
+}
}

#[derive(PartialEq, Eq, Clone, Debug)]
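The new `matches_at_least_n_tokens` helper counts non-skipped tokens with a short-circuiting `try_fold` over `ControlFlow`. The same pattern in isolation, over a plain slice (an assumed standalone example, not code from this commit):

use std::ops::ControlFlow;

// Count items satisfying a predicate, but stop iterating once `n` have been seen.
fn has_at_least_n_even(numbers: &[u32], n: u8) -> bool {
    let result = numbers.iter().try_fold(0u8, |mut count, value| {
        if *value % 2 == 0 {
            count += 1;
        }
        // `Break` short-circuits the walk; `Continue` carries the tally onward.
        if count >= n {
            ControlFlow::Break(count)
        } else {
            ControlFlow::Continue(count)
        }
    });

    // Either variant holds the final tally.
    match result {
        ControlFlow::Continue(count) | ControlFlow::Break(count) => count >= n,
    }
}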
33 changes: 15 additions & 18 deletions crates/codegen/parser/runtime/src/parser_support/recovery.rs
@@ -7,18 +7,11 @@ use crate::parser_support::parser_result::SkippedUntil;
use crate::parser_support::ParserResult;
use crate::text_index::{TextRange, TextRangeExtensions as _};

-/// An explicit parameter for the [`ParserResult::recover_until_with_nested_delims`] method.
+/// How many tokens have to be matched to trigger the error recovery.
+/// For ambiguous syntaxes this needs to be set to at least N, where N
+/// is the token lookahead required to disambiguate the syntax.
#[derive(Clone, Copy)]
-pub(crate) enum RecoverFromNoMatch {
-Yes,
-No,
-}
-
-impl RecoverFromNoMatch {
-pub fn as_bool(self) -> bool {
-matches!(self, RecoverFromNoMatch::Yes)
-}
-}
+pub(crate) struct TokenAcceptanceThreshold(pub(crate) u8);

fn opt_parse(
input: &mut ParserContext<'_>,
@@ -46,7 +39,7 @@ impl ParserResult {
input: &mut ParserContext<'_>,
lexer: &L,
expected: TokenKind,
-recover_from_no_match: RecoverFromNoMatch,
+acceptance_threshold: TokenAcceptanceThreshold,
) -> ParserResult {
enum ParseResultKind {
Match,
@@ -57,11 +50,15 @@ impl ParserResult {
let before_recovery = input.position();

let (mut nodes, mut expected_tokens, result_kind) = match self {
-ParserResult::IncompleteMatch(result) => (
-result.nodes,
-result.expected_tokens,
-ParseResultKind::Incomplete,
-),
+ParserResult::IncompleteMatch(result)
+if result.matches_at_least_n_tokens(acceptance_threshold.0) =>
+{
+(
+result.nodes,
+result.expected_tokens,
+ParseResultKind::Incomplete,
+)
+}
ParserResult::Match(result)
if lexer
.peek_token_with_trivia::<LexCtx>(input)
@@ -70,7 +67,7 @@ impl ParserResult {
{
(result.nodes, result.expected_tokens, ParseResultKind::Match)
}
-ParserResult::NoMatch(result) if recover_from_no_match.as_bool() => {
+ParserResult::NoMatch(result) if acceptance_threshold.0 == 0 => {
(vec![], result.expected_tokens, ParseResultKind::NoMatch)
}
// No need to recover, so just return as-is.
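Taken together, the recovery gating in `recover_until_with_nested_delims` now behaves roughly as follows (a simplified sketch with assumed names, not the crate's actual types):

// Simplified model of the threshold check above.
enum Outcome {
    /// The body matched partially; `tokens_matched` counts non-skipped tokens.
    Incomplete { tokens_matched: u8 },
    /// Nothing in the delimited body matched.
    NoMatch,
}

/// Whether to attempt recovery by skipping ahead to the closing delimiter.
fn attempt_recovery(outcome: &Outcome, threshold: u8) -> bool {
    match outcome {
        // An incomplete match qualifies once at least `threshold` tokens matched.
        Outcome::Incomplete { tokens_matched } => *tokens_matched >= threshold,
        // A completely unmatched body is recovered only with a threshold of 0 (the default).
        Outcome::NoMatch => threshold == 0,
    }
}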
Changes in another file:
@@ -22,7 +22,7 @@ use crate::napi_interface::parse_output::ParseOutput as NAPIParseOutput;
use crate::parse_output::ParseOutput;
use crate::parser_support::{
ChoiceHelper, OneOrMoreHelper, OptionalHelper, ParserContext, ParserFunction, ParserResult,
-PrecedenceHelper, RecoverFromNoMatch, SeparatedHelper, SequenceHelper, ZeroOrMoreHelper,
+PrecedenceHelper, SeparatedHelper, SequenceHelper, TokenAcceptanceThreshold, ZeroOrMoreHelper,
};

#[derive(Debug)]
8 changes: 4 additions & 4 deletions crates/solidity/inputs/language/src/definition.rs
@@ -3349,10 +3349,10 @@ codegen_language_macros::compile!(Language(
open = open_brace,
close = close_brace,
// NOTE: Despite `NamedArguments` requiring at least one element,
-// we need to disable attempting to recover from empty elements,
-// because this postfix is ambiguous with `try <EXPR> {} catch {}`
-// and could lead to incorrect parsing if we recover past valid syntax.
-disallow_unmatched_body = true
+// we can only recover if we found at least two tokens (`ident:`)
+// in the body, as this may be otherwise ambiguous with
+// `try <EXPR> { func() } catch {}`.
+tokens_matched_acceptance_threshold = 2
)
),
fields = (