Bring back keyword lookup using trie
Xanewok committed Dec 27, 2023
1 parent 1c82302 commit b9077aa
Showing 7 changed files with 11,683 additions and 13,251 deletions.
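For context, the commit reintroduces a character-by-character trie over keyword spellings, so the lexer can match single-atom keywords in one pass instead of invoking every keyword scanner in turn; keywords that need compound or versioned scanning keep their dedicated scanner functions. A minimal, self-contained sketch of the lookup shape follows. The KeywordTrie type and its methods are illustrative stand-ins rather than the crate's actual API; the real generator builds the trie at codegen time and emits it as nested scan_chars! matches (see trie.rs below).

use std::collections::BTreeMap;

// Illustrative only: a tiny runtime trie keyed by keyword spelling.
struct KeywordTrie<T> {
    subtries: BTreeMap<char, KeywordTrie<T>>,
    payload: Option<T>,
}

impl<T> KeywordTrie<T> {
    fn new() -> Self {
        Self { subtries: BTreeMap::new(), payload: None }
    }

    fn insert(&mut self, key: &str, payload: T) {
        let mut node = self;
        for c in key.chars() {
            node = node.subtries.entry(c).or_insert_with(Self::new);
        }
        node.payload = Some(payload);
    }

    // Returns the payload of the longest keyword that prefixes `input`.
    fn longest_match(&self, input: &str) -> Option<&T> {
        let mut node = self;
        let mut best = None;
        for c in input.chars() {
            match node.subtries.get(&c) {
                Some(next) => {
                    node = next;
                    best = node.payload.as_ref().or(best);
                }
                None => break,
            }
        }
        best
    }
}
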
41 changes: 41 additions & 0 deletions crates/codegen/grammar/src/scanner_definition.rs
@@ -115,3 +115,44 @@ impl From<KeywordScannerDefinitionNode> for ScannerDefinitionNode {
}
}
}

/// A [`KeywordScannerDefinitionRef`] that only has a single atom value.
#[derive(Clone)]
pub struct KeywordScannerAtomic(KeywordScannerDefinitionRef);

impl KeywordScannerAtomic {
/// Returns `Some` if the definition is a single atom value, `None` otherwise.
pub fn try_from_def(def: &KeywordScannerDefinitionRef) -> Option<Self> {
match def.definitions() {
[KeywordScannerDefinitionVersionedNode {
value: KeywordScannerDefinitionNode::Atom(_),
..
}] => Some(Self(def.clone())),
_ => None,
}
}
}

impl std::ops::Deref for KeywordScannerAtomic {
type Target = KeywordScannerDefinitionRef;

fn deref(&self) -> &Self::Target {
&self.0
}
}

impl KeywordScannerAtomic {
pub fn definition(&self) -> &KeywordScannerDefinitionVersionedNode {
self.0
    .definitions()
    .first()
    .expect("KeywordScannerAtomic should have exactly one definition")
}

pub fn value(&self) -> &str {
match self.definition() {
KeywordScannerDefinitionVersionedNode {
value: KeywordScannerDefinitionNode::Atom(atom),
..
} => atom,
_ => unreachable!("KeywordScannerAtomic should have a single atom value"),
}
}
}
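
Downstream, the code generator splits keyword definitions on this wrapper: single-atom keywords are inserted into a trie keyed by their literal value, while everything else keeps a generated compound scanner. A rough sketch of that split (bindings such as keyword_trie and compound are illustrative; the real loop is in code_generator.rs below):

// Illustrative sketch of the per-context split over keyword definitions.
for (name, def) in &context.keyword_scanner_defs {
    match KeywordScannerAtomic::try_from_def(def) {
        // Single-atom keyword: its literal value can be matched by the trie.
        Some(atomic) => keyword_trie.insert(atomic.value(), atomic.clone()),
        // Versioned/compound keyword: keep a dedicated scanner function.
        None => compound.insert(*name, def.to_scanner_code()),
    }
}
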
50 changes: 32 additions & 18 deletions crates/codegen/parser/generator/src/code_generator.rs
@@ -3,9 +3,9 @@ use std::path::Path;

use anyhow::Result;
use codegen_grammar::{
Grammar, GrammarVisitor, KeywordScannerDefinitionRef, ParserDefinitionNode,
ParserDefinitionRef, PrecedenceParserDefinitionRef, ScannerDefinitionNode,
ScannerDefinitionRef, TriviaParserDefinitionRef,
Grammar, GrammarVisitor, KeywordScannerAtomic, KeywordScannerDefinitionRef,
ParserDefinitionNode, ParserDefinitionRef, PrecedenceParserDefinitionRef,
ScannerDefinitionNode, ScannerDefinitionRef, TriviaParserDefinitionRef,
};
use infra_utils::cargo::CargoWorkspace;
use infra_utils::codegen::Codegen;
@@ -31,7 +31,7 @@ pub struct CodeGenerator {
scanner_functions: BTreeMap<&'static str, String>, // (name of scanner, code)
scanner_contexts: BTreeMap<&'static str, ScannerContext>,
// All of the keyword scanners (for now we assume we don't have context-specific keywords)
keyword_scanners: BTreeMap<&'static str, String>,
keyword_compound_scanners: BTreeMap<&'static str, String>,

parser_functions: BTreeMap<&'static str, String>, // (name of parser, code)

@@ -48,7 +48,10 @@ struct ScannerContext {
#[serde(skip)]
scanner_definitions: BTreeSet<&'static str>,
literal_scanner: String,
keyword_scanners: BTreeMap<&'static str, (&'static str, String)>,
keyword_compound_scanners: BTreeMap<&'static str, (&'static str, String)>,
keyword_trie_scanner: String,
#[serde(skip)]
keyword_scanner_defs: BTreeMap<&'static str, KeywordScannerDefinitionRef>,
identifier_scanners: BTreeSet<&'static str>,
compound_scanner_names: Vec<&'static str>,
delimiters: BTreeMap<&'static str, &'static str>,
@@ -215,21 +218,36 @@ impl GrammarVisitor for CodeGenerator {
context.literal_scanner = literal_trie.to_scanner_code().to_string();

context.identifier_scanners = context
.keyword_scanners
.iter()
.map(|(_, (ident_scanner, _))| *ident_scanner)
.keyword_scanner_defs
.values()
.map(|def| def.identifier_scanner())
.collect();

let mut keyword_trie = Trie::new();
for (name, def) in &context.keyword_scanner_defs {
match KeywordScannerAtomic::try_from_def(def) {
Some(atomic) => keyword_trie.insert(atomic.value(), atomic.clone()),
None => {
context.keyword_compound_scanners.insert(
name,
(def.identifier_scanner(), def.to_scanner_code().to_string()),
);
}
}
}

context.keyword_trie_scanner = keyword_trie.to_scanner_code().to_string();
}

// Collect all of the keyword scanners into a single list to be defined at top-level
self.keyword_scanners = self
self.keyword_compound_scanners = self
.scanner_contexts
.values()
.flat_map(|context| {
context
.keyword_scanners
.keyword_compound_scanners
.iter()
.map(|(name, (_, code))| (*name, code.to_string()))
.map(|(name, (_, code))| (*name, code.clone()))
})
.collect();

@@ -334,13 +352,9 @@ impl GrammarVisitor for CodeGenerator {
self.token_kinds.insert(scanner.name());

// Assume we don't have context-specific keywords for now
self.current_context().keyword_scanners.insert(
scanner.name(),
(
scanner.identifier_scanner(),
scanner.to_scanner_code().to_string(),
),
);
self.current_context()
.keyword_scanner_defs
.insert(scanner.name(), scanner.clone());
}
// Collect delimiters for each context
ParserDefinitionNode::DelimitedBy(open, _, close) => {
@@ -36,8 +36,10 @@ impl KeywordScannerDefinitionExtensions for KeywordScannerDefinitionRef {
})
.collect();

quote! {
scan_keyword_choice!(input, #(#kw_scanners),*)
match &kw_scanners[..] {
[] => quote! { KeywordScan::Absent },
[scanner] => scanner.clone(),
multiple => quote! { scan_keyword_choice!(input, #(#multiple),*) },
}
}
}
91 changes: 64 additions & 27 deletions crates/codegen/parser/generator/src/trie.rs
@@ -1,20 +1,23 @@
use std::collections::BTreeMap;
use std::fmt::Debug;

use codegen_grammar::{ScannerDefinitionNode, ScannerDefinitionRef, VersionQualityRange};
use codegen_grammar::{
KeywordScannerAtomic, KeywordScannerDefinitionVersionedNode, ScannerDefinitionNode,
ScannerDefinitionRef, VersionQualityRange,
};
use proc_macro2::TokenStream;
use quote::{format_ident, quote};

use crate::parser_definition::VersionQualityRangeVecExtensions;

#[derive(Clone, Debug, Default)]
pub struct Trie {
pub struct Trie<T: Payload> {
pub subtries: BTreeMap<char, Self>,
pub key: Option<String>,
pub payload: Option<ScannerDefinitionRef>,
pub payload: Option<T>,
}

impl Trie {
impl<T: Payload> Trie<T> {
pub fn new() -> Self {
Self {
subtries: BTreeMap::new(),
@@ -23,7 +26,7 @@ impl Trie {
}
}

pub fn insert(&mut self, key: &str, payload: ScannerDefinitionRef) {
pub fn insert(&mut self, key: &str, payload: T) {
let mut node = self;
for char in key.chars() {
node = node.subtries.entry(char).or_insert_with(Self::new);
@@ -34,7 +37,7 @@ impl Trie {

// Finds the next node that has either a payload or more than one subtrie
// It returns the path to that node and the node itself
pub fn next_interesting_node(&self, prefix: Option<char>) -> (Vec<char>, &Trie) {
pub fn next_interesting_node(&self, prefix: Option<char>) -> (Vec<char>, &Self) {
let mut path = prefix.map(|c| vec![c]).unwrap_or_default();
let mut node = self;
while node.payload.is_none() && node.subtries.len() == 1 {
@@ -57,26 +60,10 @@ impl Trie {
})
.collect::<Vec<_>>();

let leaf = if let Some(scanner_definition_ref) = &trie.payload {
let kind = format_ident!("{}", scanner_definition_ref.name());

if branches.is_empty() && !path.is_empty() {
// This is an optimisation for a common case
let leaf = quote! { scan_chars!(input, #(#path),*).then_some(TokenKind::#kind) };

return scanner_definition_ref
.node()
.applicable_version_quality_ranges()
.wrap_code(leaf, Some(quote! { None }));
}

scanner_definition_ref
.node()
.applicable_version_quality_ranges()
.wrap_code(quote! { Some(TokenKind::#kind) }, Some(quote! { None }))
} else {
quote! { None }
};
let leaf = trie
.payload
.as_ref()
.map_or_else(T::default_case, T::to_leaf_code);

let trie_code = if branches.is_empty() {
leaf
@@ -90,14 +77,15 @@ impl Trie {
}
};

let default_case = T::default_case();
if path.is_empty() {
trie_code
} else {
quote! {
if scan_chars!(input, #(#path),*) {
#trie_code
} else {
None
#default_case
}
}
}
@@ -123,3 +111,52 @@ impl VersionWrapped for ScannerDefinitionNode {
}
}
}

pub trait Payload {
fn to_leaf_code(&self) -> TokenStream;
fn default_case() -> TokenStream;
}

impl Payload for ScannerDefinitionRef {
fn to_leaf_code(&self) -> TokenStream {
let kind = format_ident!("{}", self.name());

self.node().applicable_version_quality_ranges().wrap_code(
quote! { Some(TokenKind::#kind) },
Some(Self::default_case()),
)
}

fn default_case() -> TokenStream {
quote! { None }
}
}

impl Payload for KeywordScannerAtomic {
fn to_leaf_code(&self) -> TokenStream {
let kind = format_ident!("{}", self.name());

let KeywordScannerDefinitionVersionedNode {
enabled, reserved, ..
} = self.definition();

let enabled_cond = enabled.as_bool_expr();
let reserved_cond = reserved.as_bool_expr();

// TODO: Simplify generated code if we trivially know that reserved or enabled is true
quote! {
// Optimize to only attempt scanning if it's enabled or reserved; the (bool) checks are trivial
if #reserved_cond {
Some((KeywordScan::Reserved, TokenKind::#kind))
} else if #enabled_cond {
Some((KeywordScan::Present, TokenKind::#kind))
} else {
None
}
}
}

fn default_case() -> TokenStream {
quote! { None }
}
}
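
To make the trait contract concrete, a minimal, purely illustrative Payload implementation that always emits a fixed token kind could look like the following (TokenStream, format_ident!, and quote! are already imported at the top of this file):

// Illustrative only: a toy Payload that unconditionally yields one token kind.
struct FixedKind(&'static str);

impl Payload for FixedKind {
    fn to_leaf_code(&self) -> TokenStream {
        let kind = format_ident!("{}", self.0);
        // No gating: always produce the kind once the trie path has matched.
        quote! { Some(TokenKind::#kind) }
    }

    fn default_case() -> TokenStream {
        quote! { None }
    }
}

The two real implementations above differ only in how much gating wraps the leaf: version ranges for plain scanners, and enabled/reserved checks plus a KeywordScan variant for atomic keywords.
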
24 changes: 18 additions & 6 deletions crates/codegen/parser/runtime/src/templates/language.rs.jinja2
@@ -92,9 +92,10 @@ impl Language {
{% endfor %}

// Keyword scanners
{%- for keyword_name, code in code.keyword_scanners %}
{%- for keyword_name, code in code.keyword_compound_scanners %}
#[allow(clippy::ifs_same_cond, clippy::eq_op, clippy::nonminimal_bool, clippy::overly_complex_bool_expr)]
#[allow(clippy::wrong_self_convention)] // from_keyword refers to a "from" keyword
#[inline]
fn {{ keyword_name | snake_case }}(&self, input: &mut ParserContext<'_>) -> KeywordScan { {{ code }} }
{%- endfor %}

@@ -196,26 +197,37 @@ impl Lexer for Language {
longest_match! {
{%- for name in context.compound_scanner_names %}
{%- if name not in context.identifier_scanners %}
{ {{name }} = {{ name | snake_case }} }
{ {{name }} = {{ name | snake_case }} }
{%- endif -%}
{%- endfor %}
}
// Make sure keyword identifiers are last so they don't grab other things
// Make sure promotable identifiers are last so they don't grab other things
longest_match! {
{%- for name in context.identifier_scanners %}
{ {{ name }} = {{ name | snake_case }} }
{ {{ name }} = {{ name | snake_case }} }
{%- endfor %}
}

// Attempt keyword promotion if it was lexed as an identifier
// Attempt keyword promotion if possible
if longest_tokens.iter().any(|tok| [
{% for ident_scanner in context.identifier_scanners %}
TokenKind::{{ ident_scanner }},
{% endfor %}
].contains(tok))
{
// Try fast path for atomic keywords
if let Some((scan, kind)) = {{ context.keyword_trie_scanner }} {
match scan {
_ if input.position() < furthest_position => {/* Prefix, do nothing */},
KeywordScan::Reserved => longest_tokens = vec![kind],
KeywordScan::Present => longest_tokens.push(kind),
KeywordScan::Absent => unreachable!(),
}
}
input.set_position(save);

longest_keyword_match! {
{%- for keyword_name, code in context.keyword_scanners %}
{%- for keyword_name, code in context.keyword_compound_scanners %}
{ {{ keyword_name }} = {{ keyword_name | snake_case }} }
{%- endfor %}
}