Skip to content

Commit

Permalink
Remove ... ellipsis from query syntax and add explicit adjacency op…
Browse files Browse the repository at this point in the history
…erator (#1030)

This PR changes the CST query syntax by:

- Removing the `...` ellipsis operator and adding it implicitly at the
edges and between elements in a matching sequence, so that `[Foo [A]
[B]]` is equivalent to `[Foo ... [A] ... [B] ...]`.

- Adding the `.` adjacency (anchor) operator to explicitly indicate that
a matched node should be the first (eg. `[Foo . [A]]`) or last child
(eg. `[Foo [A] .]`), or that two matched nodes should be consecutive
(eg. `[Foo [A] . [B]]`).
The adjacency operator is allowed in sub-sequences of alternative
options or quantified sequences, but not at the beginning or end of the
pattern, where the adjacency is implicit. So:
`[Foo ([Bar] | [X] . [Y])]` is allowed, but `[Foo ([Bar] | . [X])]` is
not.

- Trivia kinds cannot be used in node matchers.

The PR also introduces some semantic changes to query execution:

- Trivia nodes are skipped over when executing a query and they cannot
be matched against.

- Only the first implicit ellipsis operator is allowed to match multiple
nodes in a sequence of siblings, unless there is an explicit node match
in between. This prevents the engine from returning duplicate results
when two ellipsis operators are effectively adjacent (for example when
they are separated by an optional matcher).

For example, given a sequence such as `ABCD`, the query `[_] ["B"]? [_]`
will operationally be equivalent to `[_] ... ["B"]? ... [_]`. With the
previous semantics, this would have returned 4 results, matching:
1. zero nodes for the first ellipsis, the optional matches the `B` and
the second ellipsis takes the `C`
2. zero nodes for the first ellipsis, zero nodes for the optional, and
the second ellipsis takes both `BC`
3. first ellipsis takes `B`, optional matches nothing, and second
ellipsis takes `C`
4. first ellipsis takes both `BC`, and optional and second ellipsis take
no nodes

After this PR, only two results are possible, corresponding to cases 1
and 4 above, since the second ellipsis is allowed to match nodes
_only_ if the optional succeeds in matching at least some node.

The two returned results are distinct though, because the user may want
to capture the optional with the query `[_] @x ["B"]? [_]`. By
unification semantics the optional can match zero or one nodes, which is
consistent with the results obtained without capturing.

---------

Co-authored-by: Omar Tawfik <[email protected]>
  • Loading branch information
ggiraldez and OmarTawfik authored Aug 7, 2024
1 parent fa38d83 commit 7e467ce
Show file tree
Hide file tree
Showing 19 changed files with 1,127 additions and 500 deletions.
5 changes: 5 additions & 0 deletions .changeset/few-taxis-retire.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@nomicfoundation/slang": minor
---

Tree Query Language: queries now ignore trivia nodes.
5 changes: 5 additions & 0 deletions .changeset/gentle-shirts-deliver.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@nomicfoundation/slang": minor
---

Tree Query Language: remove the ellipsis query `...` operator making it implicit, add an adjacency operator `.`.
373 changes: 299 additions & 74 deletions crates/metaslang/cst/src/query/engine.rs

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions crates/metaslang/cst/src/query/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ impl<T: KindTypes> Query<T> {
capture_quantifiers,
)?;
}
ASTNode::Ellipsis => {}
ASTNode::Adjacency => {}
}
Ok(())
}
Expand Down Expand Up @@ -113,7 +113,7 @@ pub enum ASTNode<T: KindTypes> {
Alternatives(Rc<AlternativesASTNode<T>>),
Sequence(Rc<SequenceASTNode<T>>),
OneOrMore(Rc<OneOrMoreASTNode<T>>),
Ellipsis,
Adjacency,
}

impl<T: KindTypes> ASTNode<T> {
Expand Down Expand Up @@ -167,7 +167,7 @@ impl<T: KindTypes> fmt::Display for ASTNode<T> {
Self::OneOrMore(one_or_more) => {
write!(f, "({})+", one_or_more.child)
}
Self::Ellipsis => write!(f, "..."),
Self::Adjacency => write!(f, "."),
}
}
}
Expand Down Expand Up @@ -256,6 +256,10 @@ pub struct NodeMatchASTNode<T: KindTypes> {
/// Query AST node for an ordered sequence of child matchers.
#[derive(Debug)]
pub struct SequenceASTNode<T: KindTypes> {
/// The items of the sequence, in source order.
pub children: Vec<ASTNode<T>>,
/// By default sequences can match any number of nodes at the beginning and
/// end of it. Setting this value to true prevents it and instead forces
/// strict adjacency at the edges.
pub adjacent: bool,
}

#[derive(Debug)]
Expand Down
184 changes: 140 additions & 44 deletions crates/metaslang/cst/src/query/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@ use std::fmt;
use std::rc::Rc;

use nom::branch::alt;
use nom::bytes::complete::{is_not, tag, take_while, take_while1, take_while_m_n};
use nom::character::complete::{char, multispace0, multispace1, satisfy};
use nom::bytes::complete::{is_not, take_while, take_while1, take_while_m_n};
use nom::character::complete::{char, multispace0, multispace1, none_of, satisfy};
use nom::combinator::{
all_consuming, cut, map_opt, map_res, opt, peek, recognize, success, value, verify,
all_consuming, cut, eof, map_opt, map_res, opt, peek, recognize, success, value, verify,
};
use nom::error::{ErrorKind, FromExternalError, ParseError};
use nom::multi::{fold_many0, many1, separated_list1};
Expand All @@ -19,7 +19,7 @@ use super::model::{
};
use crate::cst::NodeKind;
use crate::text_index::TextIndex;
use crate::{AbstractKind as _, KindTypes};
use crate::{AbstractKind as _, KindTypes, TerminalKind as _};

// ----------------------------------------------------------------------------
// Parse errors
Expand Down Expand Up @@ -47,10 +47,21 @@ enum QueryParserErrorKind {
Syntax(QuerySyntaxError),
}

/// Query-specific syntax errors reported by the parser, in addition to the
/// generic `nom` error kinds.
///
/// `Clone` is derived so that a parser combinator holding one of these values
/// can emit a fresh copy every time it is invoked.
#[derive(Clone)]
enum QuerySyntaxError {
/// Carries the offending text.
/// NOTE(review): presumably raised for an unrecognized edge label — confirm
/// against the edge label parser, which is not visible here.
EdgeLabel(String),
/// The identifier names neither a terminal kind nor a nonterminal kind.
/// Carries the identifier as written in the query.
NodeKind(String),
/// An escaped Unicode character in the query is invalid.
EscapedUnicode,
/// The query uses the removed `...` ellipsis operator.
DeprecatedEllipsis,
/// The query names a trivia terminal kind in a node matcher.
ForbiddenTriviaKind,
}

impl<I> QueryParserError<I> {
fn from_query_syntax_error(input: I, error: QuerySyntaxError) -> Self {
QueryParserError {
errors: vec![(input, QueryParserErrorKind::Syntax(error))],
}
}
}

impl<I> ParseError<I> for QueryParserError<I> {
Expand All @@ -74,9 +85,7 @@ impl<I> ParseError<I> for QueryParserError<I> {

impl<I> FromExternalError<I, QuerySyntaxError> for QueryParserError<I> {
fn from_external_error(input: I, _kind: ErrorKind, e: QuerySyntaxError) -> Self {
QueryParserError {
errors: vec![(input, QueryParserErrorKind::Syntax(e))],
}
Self::from_query_syntax_error(input, e)
}
}

Expand All @@ -88,6 +97,12 @@ impl fmt::Display for QuerySyntaxError {
QuerySyntaxError::EscapedUnicode => {
write!(f, "Invalid escaped Unicode character")
}
QuerySyntaxError::DeprecatedEllipsis => {
write!(f, "The ellipsis `...` operator is deprecated, and replaced with a new adjacency `.` operator. For more information, check the Tree Query Language guide: https://nomicfoundation.github.io/slang/user-guide/tree-query-language/")
}
QuerySyntaxError::ForbiddenTriviaKind => {
write!(f, "Matching trivia nodes directly is forbidden. For more information, check the Tree Query Language guide: https://nomicfoundation.github.io/slang/user-guide/tree-query-language/")
}
}
}
}
Expand Down Expand Up @@ -149,7 +164,7 @@ fn compute_row_and_column(target: &str, input: &str) -> TextIndex {
fn parse_matcher_alternatives<T: KindTypes>(
i: &str,
) -> IResult<&str, ASTNode<T>, QueryParserError<&str>> {
separated_list1(token('|'), parse_matcher_sequence::<T>)
separated_list1(token('|'), parse_matcher_alt_sequence::<T>)
.map(|mut children| {
if children.len() == 1 {
children.pop().unwrap()
Expand All @@ -163,38 +178,76 @@ fn parse_matcher_alternatives<T: KindTypes>(
fn parse_matcher_sequence<T: KindTypes>(
i: &str,
) -> IResult<&str, ASTNode<T>, QueryParserError<&str>> {
many1(parse_quantified_matcher::<T>)
.map(|mut children| {
if children.len() == 1 {
children.pop().unwrap()
} else {
ASTNode::Sequence(Rc::new(SequenceASTNode { children }))
}
})
.parse(i)
verify(
many1(parse_sequence_item::<T>),
|children: &[ASTNode<T>]| {
// It doesn't make sense for a sequence to be a single adjacency operator
children.len() > 1 || !matches!(children[0], ASTNode::Adjacency)
},
)
.map(|children| {
ASTNode::Sequence(Rc::new(SequenceASTNode {
children,
adjacent: false,
}))
})
.parse(i)
}

fn parse_quantified_matcher<T: KindTypes>(
/// Parses one branch of an alternatives list: one or more sequence items.
///
/// The branch is rejected when its first or last item is an adjacency
/// operator, because a branch is implicitly adjacent to the matchers that
/// precede and follow it. A branch with a single item yields that item
/// directly; longer branches yield a `Sequence` with `adjacent` set to `true`.
fn parse_matcher_alt_sequence<T: KindTypes>(
    i: &str,
) -> IResult<&str, ASTNode<T>, QueryParserError<&str>> {
    let (rest, mut items) = verify(
        many1(parse_sequence_item::<T>),
        // `many1` never yields an empty list, so both edges always exist.
        |items: &[ASTNode<T>]| {
            !matches!(items.first(), Some(ASTNode::Adjacency))
                && !matches!(items.last(), Some(ASTNode::Adjacency))
        },
    )
    .parse(i)?;

    let node = if items.len() == 1 {
        // A one-item branch needs no wrapping sequence node.
        items.pop().unwrap()
    } else {
        ASTNode::Sequence(Rc::new(SequenceASTNode {
            children: items,
            adjacent: true,
        }))
    };

    Ok((rest, node))
}

fn parse_sequence_item<T: KindTypes>(i: &str) -> IResult<&str, ASTNode<T>, QueryParserError<&str>> {
alt((
ellipsis_token.map(|_| ASTNode::Ellipsis), // Cannot be quantified
pair(
parse_bound_matcher,
parse_trailing_quantifier, // admits epsilon
)
.map(|(child, quantifier)| match quantifier {
CaptureQuantifier::ZeroOrOne => ASTNode::Optional(Rc::new(OptionalASTNode { child })),
CaptureQuantifier::ZeroOrMore => ASTNode::Optional(Rc::new(OptionalASTNode {
child: ASTNode::OneOrMore(Rc::new(OneOrMoreASTNode { child })),
})),
CaptureQuantifier::OneOrMore => ASTNode::OneOrMore(Rc::new(OneOrMoreASTNode { child })),
CaptureQuantifier::One => child,
}),
ellipsis_token,
adjacency_operator::<T>,
parse_quantified_matcher::<T>,
))
.parse(i)
}

/// Parses a bound matcher followed by an optional trailing quantifier and
/// wraps the matcher in the AST nodes that quantifier calls for:
///
/// - `One`: the matcher itself, unwrapped.
/// - `ZeroOrOne`: `Optional(matcher)`.
/// - `OneOrMore`: `OneOrMore(matcher)`.
/// - `ZeroOrMore`: `Optional(OneOrMore(matcher))`.
fn parse_quantified_matcher<T: KindTypes>(
    i: &str,
) -> IResult<&str, ASTNode<T>, QueryParserError<&str>> {
    // The quantifier parser admits epsilon, so a bare matcher is accepted too.
    let (rest, (matcher, quantifier)) =
        pair(parse_bound_matcher::<T>, parse_trailing_quantifier).parse(i)?;

    let optional = |child: ASTNode<T>| ASTNode::Optional(Rc::new(OptionalASTNode { child }));
    let one_or_more =
        |child: ASTNode<T>| ASTNode::OneOrMore(Rc::new(OneOrMoreASTNode { child }));

    let node = match quantifier {
        CaptureQuantifier::One => matcher,
        CaptureQuantifier::ZeroOrOne => optional(matcher),
        CaptureQuantifier::OneOrMore => one_or_more(matcher),
        CaptureQuantifier::ZeroOrMore => optional(one_or_more(matcher)),
    };

    Ok((rest, node))
}

fn parse_bound_matcher<T: KindTypes>(i: &str) -> IResult<&str, ASTNode<T>, QueryParserError<&str>> {
pair(
opt(capture_name_token),
Expand Down Expand Up @@ -307,7 +360,10 @@ fn anonymous_selector<T: KindTypes>(
terminated(
terminated(
char('_'),
peek(satisfy(|c| c != '_' && !c.is_alphanumeric())),
peek(
eof.map(|_| ' ')
.or(satisfy(|c| c != '_' && !c.is_alphanumeric())),
),
),
multispace0,
)
Expand All @@ -319,14 +375,21 @@ fn kind_token<T: KindTypes>(i: &str) -> IResult<&str, NodeKind<T>, QueryParserEr
terminated(
preceded(
peek(satisfy(|c| c.is_alphabetic() || c == '_')),
cut(map_res(raw_identifier, |id| {
T::TerminalKind::try_from_str(id.as_str())
.map(NodeKind::Terminal)
.or_else(|_| {
T::NonterminalKind::try_from_str(id.as_str()).map(NodeKind::Nonterminal)
})
.or(Err(QuerySyntaxError::NodeKind(id)))
})),
cut(map_res(
raw_identifier,
|id| match T::TerminalKind::try_from_str(id.as_str()) {
Ok(kind) => {
if kind.is_trivia() {
Err(QuerySyntaxError::ForbiddenTriviaKind)
} else {
Ok(NodeKind::Terminal(kind))
}
}
Err(_) => T::NonterminalKind::try_from_str(id.as_str())
.map(NodeKind::Nonterminal)
.or(Err(QuerySyntaxError::NodeKind(id))),
},
)),
),
multispace0,
)
Expand Down Expand Up @@ -413,10 +476,43 @@ fn text_token(i: &str) -> IResult<&str, String, QueryParserError<&str>> {
.parse(i)
}

fn ellipsis_token(i: &str) -> IResult<&str, &str, QueryParserError<&str>> {
terminated(tag("..."), multispace0).parse(i)
}

/// Builds a parser that matches the single character `c` and then consumes
/// any whitespace that follows it (`multispace0`, so none is required).
/// The parser yields the matched character.
fn token<'input>(c: char) -> impl Parser<&'input str, char, QueryParserError<&'input str>> {
terminated(char(c), multispace0)
}

/// Parses the `.` adjacency operator and yields `ASTNode::Adjacency`.
///
/// After the `.` token (which also consumes trailing whitespace) the next
/// character is peeked without being consumed: it must exist and must not be
/// another `.` or a whitespace character. The check is wrapped in `cut`, so
/// violating it is reported as an unrecoverable failure instead of letting the
/// caller try other alternatives.
fn adjacency_operator<T: KindTypes>(i: &str) -> IResult<&str, ASTNode<T>, QueryParserError<&str>> {
    let (rest, _) = pair(token('.'), cut(peek(none_of(". \t\r\n")))).parse(i)?;
    Ok((rest, ASTNode::Adjacency))
}

/// Turns a successful recognition into a hard parse failure.
///
/// The returned parser runs `parser` on a clone of its input:
/// - if `parser` succeeds, its output is discarded and a `nom::Err::Failure`
///   carrying a copy of `error`, located at the original input, is returned;
/// - if `parser` returns an error, that error is passed through unchanged.
///
/// The returned parser therefore never succeeds, which is why its output type
/// `O2` is unconstrained and can be chosen freely by the caller.
fn recognize_as_failure<I: Clone, O1, O2, F>(
    error: QuerySyntaxError,
    mut parser: F,
) -> impl FnMut(I) -> IResult<I, O2, QueryParserError<I>>
where
    F: nom::Parser<I, O1, QueryParserError<I>>,
{
    use nom::Err::Failure;
    move |input: I| {
        parser.parse(input.clone()).and_then(|_| {
            Err(Failure(QueryParserError::from_query_syntax_error(
                input,
                error.clone(),
            )))
        })
    }
}

/// Detects the removed `...` ellipsis operator (plus any trailing whitespace)
/// and reports it as a `QuerySyntaxError::DeprecatedEllipsis` hard failure.
///
/// When the input does not start with `...`, the underlying recognizer's
/// error is returned unchanged. This parser never succeeds, so its output
/// type `O` is left for the caller to pick.
fn ellipsis_token<O>(i: &str) -> IResult<&str, O, QueryParserError<&str>> {
    use nom::bytes::complete::tag;
    let ellipsis = terminated(tag("..."), multispace0);
    recognize_as_failure(QuerySyntaxError::DeprecatedEllipsis, ellipsis).parse(i)
}
Loading

0 comments on commit 7e467ce

Please sign in to comment.