From 489c9f7606ffe2c8e4aac2885b79e4937d3332e1 Mon Sep 17 00:00:00 2001
From: John Ed Quinn
Date: Thu, 17 Nov 2022 16:50:01 -0800
Subject: [PATCH] Adds non-reserved keywords

---
 partiql-parser/src/lexer.rs              |  72 ++++++++-
 partiql-parser/src/parse/mod.rs          |  49 ++++++
 partiql-parser/src/parse/partiql.lalrpop |  30 ++++
 partiql-parser/src/preprocessor.rs       | 184 ++++++++++++++++++++---
 4 files changed, 312 insertions(+), 23 deletions(-)

diff --git a/partiql-parser/src/lexer.rs b/partiql-parser/src/lexer.rs
index 8785f755..849ad5cc 100644
--- a/partiql-parser/src/lexer.rs
+++ b/partiql-parser/src/lexer.rs
@@ -510,6 +510,8 @@ pub enum Token<'input> {
     Ion(&'input str),
     // Keywords
+    #[regex("(?i:Acyclic)")]
+    Acyclic,
     #[regex("(?i:All)")]
     All,
     #[regex("(?i:Asc)")]
     Asc,
@@ -522,6 +524,8 @@ At,
     #[regex("(?i:Between)")]
     Between,
+    #[regex("(?i:Both)")]
+    Both,
     #[regex("(?i:By)")]
     By,
     #[regex("(?i:Case)")]
     Case,
@@ -534,6 +538,8 @@ Desc,
     #[regex("(?i:Distinct)")]
     Distinct,
+    #[regex("(?i:Domain)")]
+    Domain,
     #[regex("(?i:Else)")]
     Else,
     #[regex("(?i:End)")]
     End,
@@ -570,6 +576,8 @@ Last,
     #[regex("(?i:Lateral)")]
     Lateral,
+    #[regex("(?i:Leading)")]
+    Leading,
     #[regex("(?i:Left)")]
     Left,
     #[regex("(?i:Like)")]
     Like,
@@ -602,10 +610,14 @@ Pivot,
     #[regex("(?i:Preserve)")]
     Preserve,
+    #[regex("(?i:Public)")]
+    Public,
     #[regex("(?i:Right)")]
     Right,
     #[regex("(?i:Select)")]
     Select,
+    #[regex("(?i:Simple)")]
+    Simple,
     #[regex("(?i:Table)")]
     Table,
     #[regex("(?i:Time)")]
     Time,
@@ -614,12 +626,18 @@ Timestamp,
     #[regex("(?i:Then)")]
     Then,
+    #[regex("(?i:Trail)")]
+    Trail,
+    #[regex("(?i:Trailing)")]
+    Trailing,
     #[regex("(?i:True)")]
     True,
     #[regex("(?i:Union)")]
     Union,
     #[regex("(?i:Unpivot)")]
     Unpivot,
+    #[regex("(?i:User)")]
+    User,
     #[regex("(?i:Using)")]
     Using,
     #[regex("(?i:Value)")]
     Value,
@@ -642,17 +660,20 @@ impl<'input> Token<'input> {
     pub fn is_keyword(&self) -> bool {
         matches!(
             self,
-            Token::All
+            Token::Acyclic
+                | Token::All
                 | Token::Asc
                 | Token::And
                 | Token::As
                 | Token::At
                 | Token::Between
+                | Token::Both
                 | Token::By
                 | Token::Cross
                 | Token::Date
                 | Token::Desc
                 | Token::Distinct
+                | Token::Domain
                 | Token::Escape
                 | Token::Except
                 | Token::First
@@ -668,6 +689,7 @@
                 | Token::Join
                 | Token::Last
                 | Token::Lateral
+                | Token::Leading
                 | Token::Left
                 | Token::Like
                 | Token::Limit
@@ -684,14 +706,19 @@
                 | Token::Partial
                 | Token::Pivot
                 | Token::Preserve
+                | Token::Public
                 | Token::Right
                 | Token::Select
+                | Token::Simple
                 | Token::Table
                 | Token::Time
                 | Token::Timestamp
                 | Token::Then
+                | Token::Trail
+                | Token::Trailing
                 | Token::Union
                 | Token::Unpivot
+                | Token::User
                 | Token::Using
                 | Token::Value
                 | Token::Values
@@ -748,18 +775,21 @@ impl<'input> fmt::Display for Token<'input> {
             Token::EmbeddedIonQuote => write!(f, ""),
             Token::Ion(txt) => write!(f, "<{}:ION>", txt),

-            Token::All
+            Token::Acyclic
+            | Token::All
             | Token::Asc
             | Token::And
             | Token::As
             | Token::At
             | Token::Between
+            | Token::Both
             | Token::By
             | Token::Case
             | Token::Cross
             | Token::Date
             | Token::Desc
             | Token::Distinct
+            | Token::Domain
             | Token::Else
             | Token::End
             | Token::Escape
@@ -778,6 +808,7 @@
             | Token::Join
             | Token::Last
             | Token::Lateral
+            | Token::Leading
             | Token::Left
             | Token::Like
             | Token::Limit
@@ -794,15 +825,20 @@
             | Token::Partial
             | Token::Pivot
             | Token::Preserve
+            | Token::Public
             | Token::Right
             | Token::Select
+            | Token::Simple
             | Token::Table
             | Token::Time
             | Token::Timestamp
             | Token::Then
+            | Token::Trail
+            | Token::Trailing
             | Token::True
             | Token::Union
             | Token::Unpivot
+            | Token::User
             | Token::Using
             | Token::Value
             | Token::Values
@@ -1107,6 +1143,38 @@ mod tests {
         Ok(())
     }

+    #[test]
+    fn select_non_reserved_keywords() -> Result<(), ParseError<'static, BytePosition>> {
+        let query = "SELECT acyclic, BoTh, DOMAIN, SImple, Trail, TRailing, USER\nfrom @\"foo\"";
+        let mut offset_tracker = LineOffsetTracker::default();
+        let lexer = PartiqlLexer::new(query, &mut offset_tracker);
+        let toks: Vec<_> = lexer.collect::<Result<Vec<_>, _>>()?;
+
+        assert_eq!(
+            vec![
+                Token::Select,
+                Token::Acyclic,
+                Token::Comma,
+                Token::Both,
+                Token::Comma,
+                Token::Domain,
+                Token::Comma,
+                Token::Simple,
+                Token::Comma,
+                Token::Trail,
+                Token::Comma,
+                Token::Trailing,
+                Token::Comma,
+                Token::User,
+                Token::From,
+                Token::QuotedAtIdentifier("foo"),
+            ],
+            toks.into_iter().map(|(_s, t, _e)| t).collect::<Vec<_>>()
+        );
+        assert_eq!(offset_tracker.num_lines(), 2);
+        Ok(())
+    }
+
     #[test]
     fn select_comment_block() -> Result<(), ParseError<'static, BytePosition>> {
         let query = "SELECT /*comment*/ g";
diff --git a/partiql-parser/src/parse/mod.rs b/partiql-parser/src/parse/mod.rs
index d34f4ddf..001de961 100644
--- a/partiql-parser/src/parse/mod.rs
+++ b/partiql-parser/src/parse/mod.rs
@@ -679,6 +679,55 @@ mod tests {
         }
     }

+    // PROOF OF CONCEPT: NON-RESERVED KEYWORDS
+    mod non_reserved {
+        use super::*;
+
+        #[test]
+        fn projection_list_trim_spec() {
+            parse!(r#"SELECT leading FROM t"#);
+            parse!(r#"SELECT leading, a FROM t"#);
+            parse!(r#"SELECT leading + trailing, b FROM t"#);
+            parse!(r#"SELECT both + leading + trailing, a, b, c FROM t"#);
+        }
+
+        #[test]
+        fn from_source() {
+            parse!(r#"SELECT leading, trailing, both FROM leading, trailing, both"#);
+        }
+
+        #[test]
+        fn with_trim() {
+            parse!(
+                r#"SELECT leading + trim(leading leading FROM ' hello world'), both FROM leading, trailing, both"#
+            );
+        }
+
+        #[test]
+        fn with_order() {
+            parse!(r#"SELECT order FROM t ORDER BY order + 5"#);
+            parse!(r#"SELECT order FROM order ORDER BY order + 5"#);
+            parse!(r#"SELECT ORDER FROM ORDER ORDER BY ORDER + 5"#);
+        }
+
+        #[test]
+        fn with_gpml() {
+            parse!(r#"SELECT acyclic, trail, simple FROM t"#);
+            parse!(r#"AcYcLiC"#);
+            parse!(r#"TrAiL"#);
+            parse!(r#"SiMpLe"#);
+        }
+
+        //
+        #[test]
+        fn external_customer_request() {
+            parse!(r#"SELECT user, puBlIC, DOMAIN FROM USER, pUbLIc, domain"#);
+            parse!(r#"USER"#);
+            parse!(r#"pUbLIC"#);
+            parse!(r#"domain"#);
+        }
+    }
+
     mod errors {
         use super::*;
         use crate::error::{LexError, UnexpectedToken, UnexpectedTokenData};
diff --git a/partiql-parser/src/parse/partiql.lalrpop b/partiql-parser/src/parse/partiql.lalrpop
index e31489b1..e983ff68 100644
--- a/partiql-parser/src/parse/partiql.lalrpop
+++ b/partiql-parser/src/parse/partiql.lalrpop
@@ -1025,6 +1025,27 @@ VarRefExpr: ast::Expr = {
         name: ast::SymbolPrimitive { value: ident.to_owned(), case: ast::CaseSensitivity::CaseSensitive },
         qualifier: ast::ScopeQualifier::Unqualified
     },lo..hi)),
+    <lo:@L> <ident:NonReservedKeyword> <hi:@R> => ast::Expr::VarRef(state.node(ast::VarRef {
+        name: ast::SymbolPrimitive { value: ident.to_owned(), case: ast::CaseSensitivity::CaseInsensitive },
+        qualifier: ast::ScopeQualifier::Unqualified
+    },lo..hi)),
 }

+// PROOF OF CONCEPT
+// These are all the proposed non-reserved keywords. ORDER is not itself part of
+// the proposal; it is in this list only to prove that even ORDER can be parsed
+// as a non-reserved keyword.
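+//
+// A worked example (exercised by the with_order test in parse/mod.rs above):
+// in `SELECT order FROM order ORDER BY order + 5`, the three value-position
+// `order`s reduce through NonReservedKeyword to case-insensitive variable
+// references, while `ORDER BY` is still matched as the clause keyword.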
+#[inline]
+NonReservedKeyword: &'static str = {
+    "ACYCLIC" => "ACYCLIC",
+    "BOTH" => "BOTH",
+    "DOMAIN" => "DOMAIN",
+    "LEADING" => "LEADING",
+    "ORDER" => "ORDER",
+    "PUBLIC" => "PUBLIC",
+    "SIMPLE" => "SIMPLE",
+    "TRAIL" => "TRAIL",
+    "TRAILING" => "TRAILING",
+    "USER" => "USER",
+}

 // ------------------------------------------------------------------------------ //
@@ -1267,18 +1288,21 @@ extern {
         "Ion" => lexer::Token::Ion(<&'input str>),

         // Keywords
+        "ACYCLIC" => lexer::Token::Acyclic,
         "ALL" => lexer::Token::All,
         "ASC" => lexer::Token::Asc,
         "AND" => lexer::Token::And,
         "AS" => lexer::Token::As,
         "AT" => lexer::Token::At,
         "BETWEEN" => lexer::Token::Between,
+        "BOTH" => lexer::Token::Both,
         "BY" => lexer::Token::By,
         "CASE" => lexer::Token::Case,
         "CROSS" => lexer::Token::Cross,
         "DATE" => lexer::Token::Date,
         "DESC" => lexer::Token::Desc,
         "DISTINCT" => lexer::Token::Distinct,
+        "DOMAIN" => lexer::Token::Domain,
         "ELSE" => lexer::Token::Else,
         "END" => lexer::Token::End,
         "ESCAPE" => lexer::Token::Escape,
@@ -1297,6 +1321,7 @@ extern {
         "JOIN" => lexer::Token::Join,
         "LAST" => lexer::Token::Last,
         "LATERAL" => lexer::Token::Lateral,
+        "LEADING" => lexer::Token::Leading,
         "LEFT" => lexer::Token::Left,
         "LIKE" => lexer::Token::Like,
         "LIMIT" => lexer::Token::Limit,
@@ -1313,15 +1338,20 @@ extern {
         "PARTIAL" => lexer::Token::Partial,
         "PIVOT" => lexer::Token::Pivot,
         "PRESERVE" => lexer::Token::Preserve,
+        "PUBLIC" => lexer::Token::Public,
         "RIGHT" => lexer::Token::Right,
         "SELECT" => lexer::Token::Select,
+        "SIMPLE" => lexer::Token::Simple,
         "TABLE" => lexer::Token::Table,
         "TIME" => lexer::Token::Time,
         "TIMESTAMP" => lexer::Token::Timestamp,
         "THEN" => lexer::Token::Then,
+        "TRAIL" => lexer::Token::Trail,
+        "TRAILING" => lexer::Token::Trailing,
         "TRUE" => lexer::Token::True,
         "UNION" => lexer::Token::Union,
         "UNPIVOT" => lexer::Token::Unpivot,
+        "USER" => lexer::Token::User,
         "USING" => lexer::Token::Using,
         "VALUE" => lexer::Token::Value,
         "VALUES" => lexer::Token::Values,
diff --git a/partiql-parser/src/preprocessor.rs b/partiql-parser/src/preprocessor.rs
index 9e31ba3a..8e4791dd 100644
--- a/partiql-parser/src/preprocessor.rs
+++ b/partiql-parser/src/preprocessor.rs
@@ -60,26 +60,128 @@ mod built_ins {
     use regex::Regex;

     use FnExprArgMatch::{
-        AnyOne, AnyZeroOrMore as AnyStar, NamedArgId as Id, NamedArgKw as Kw, Synthesize as Syn,
+        AnyOne, AnyZeroOrMore as AnyStar, Match, NamedArgId as Id, NamedArgKw as Kw,
+        Synthesize as Syn,
     };

-    const TRIM_SPECIFIER: &str = "(?i:leading)|(?i:trailing)|(?i:both)";
+    /// PROOF OF CONCEPT EXAMPLE
+    /// Creates the combinations of regular expressions to parse TRIM.
+    /// Allows for using LEADING/TRAILING/BOTH as both identifiers and keywords.
+    /// Note: This isn't meant to be efficient.
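+    ///
+    /// A worked example (mirrored by the preprocessor tests below): in
+    /// `trim(BOTH TrAiLiNg from TRAILING)` a trim specifier appears in all
+    /// three argument positions. The patterns generated here let the
+    /// preprocessor rewrite it to `trim("BOTH" : TrAiLiNg, "from" : TRAILING)`,
+    /// keeping the first token as the named argument and treating the other
+    /// two as plain identifiers.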
+    fn create_trim_regular_expression_combinations<'a>() -> Vec<Vec<FnExprArgMatch<'a>>> {
+        let outer_tokens = [Token::Leading, Token::Trailing, Token::Both];
+        let mid_tokens = [Token::Leading, Token::Trailing, Token::Both];
+        let inner_tokens = [Token::Leading, Token::Trailing, Token::Both];
+        let mut regular_expressions: Vec<Vec<FnExprArgMatch<'a>>> = Vec::new();
+
+        // 1st, 2nd, and 3rd: trim(trailing both from leading) => trim("trailing": both, "from": leading)
+        for outer_token in outer_tokens.iter() {
+            for mid_token in mid_tokens.iter() {
+                for last_token in inner_tokens.iter() {
+                    let expression = vec![
+                        Kw(outer_token.clone()),
+                        Match(mid_token.clone()),
+                        AnyStar(false),
+                        Kw(Token::From),
+                        Match(last_token.clone()),
+                        AnyStar(false),
+                    ];
+                    regular_expressions.push(expression);
+                }
+            }
+        }
+
+        // 1st and 3rd: trim(trailing 'hello' from leading) => trim("trailing": 'hello', "from": leading)
+        // 1st and 2nd: trim(trailing leading from x) => trim("trailing": leading, "from": x)
+        for outer_token in outer_tokens.iter() {
+            for mid_token in mid_tokens.iter() {
+                let expression = vec![
+                    Kw(outer_token.clone()),
+                    AnyOne(false),
+                    AnyStar(false),
+                    Kw(Token::From),
+                    Match(mid_token.clone()),
+                    AnyStar(false),
+                ];
+                let expression1 = vec![
+                    Kw(outer_token.clone()),
+                    Match(mid_token.clone()),
+                    AnyStar(false),
+                    Kw(Token::From),
+                    AnyOne(false),
+                    AnyStar(false),
+                ];
+                regular_expressions.push(expression);
+                regular_expressions.push(expression1);
+            }
+        }
+
+        // 1st only: trim(trailing 'hello' from x) => trim("trailing": 'hello', "from": x)
+        // 3rd only: trim('hello' from leading) => trim('hello', "from": leading)
+        for outer_token in outer_tokens {
+            let expression = vec![
+                Kw(outer_token.clone()),
+                AnyOne(false),
+                AnyStar(false),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ];
+
+            let expression_01 = vec![
+                AnyOne(false),
+                AnyStar(false),
+                Kw(Token::From),
+                Match(outer_token.clone()),
+                AnyStar(false),
+            ];
+            regular_expressions.push(expression);
+            regular_expressions.push(expression_01);
+        }
+        regular_expressions
+    }

     pub(crate) fn built_in_trim() -> FnExpr<'static> {
-        let re = Regex::new(TRIM_SPECIFIER).unwrap();
+        let mut permutations = create_trim_regular_expression_combinations();
+        let mut patterns = vec![
+            // e.g., trim(trailing from x) => trim("trailing": ' ', "from": x)
+            vec![
+                Kw(Token::Leading),
+                Syn(Token::String(" ")),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ],
+            vec![
+                Kw(Token::Trailing),
+                Syn(Token::String(" ")),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ],
+            vec![
+                Kw(Token::Both),
+                Syn(Token::String(" ")),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ],
+            // e.g., trim(' ' from x) => trim(' ', "from": x)
+            vec![
+                AnyOne(false),
+                AnyStar(false),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ],
+            // e.g., trim(from x) => trim("from": x)
+            vec![Kw(Token::From), AnyOne(false), AnyStar(false)],
+        ];
+        permutations.append(&mut patterns);
         FnExpr {
             fn_names: vec!["trim"],
             #[rustfmt::skip]
-            patterns: vec![
-                // e.g., trim(leading 'tt' from x) => trim("leading": 'tt', "from": x)
-                vec![Id(re.clone()), AnyOne(false), AnyStar(false), Kw(Token::From), AnyOne(false), AnyStar(false)],
-                // e.g., trim(trailing from x) => trim("trailing": ' ', "from": x)
-                vec![Id(re), Syn(Token::String(" ")), Kw(Token::From), AnyOne(false), AnyStar(false)],
-                // e.g., trim(' ' from x) => trim(' ', "from": x)
-                vec![AnyOne(false), AnyStar(false), Kw(Token::From), AnyOne(false), AnyStar(false)],
-                // e.g., trim(from x) => trim("from": x)
-                vec![Kw(Token::From), AnyOne(false), AnyStar(false)],
-            ],
+            patterns: permutations,
         }
     }
@@ -640,7 +742,7 @@ mod tests {
             vec![
                 Token::UnquotedIdent("trim"),
                 Token::OpenParen,
-                Token::UnquotedIdent("LEADING"),
+                Token::QuotedIdent("LEADING"),
                 Token::Colon,
                 Token::String("Foo"),
                 Token::Comma,
@@ -704,31 +806,71 @@
         assert_eq!(
             preprocess(r#"trim(LEADING 'Foo' from 'FooBar')"#)?,
-            lex(r#"trim(LEADING : 'Foo', "from" : 'FooBar')"#)?
+            lex(r#"trim("LEADING" : 'Foo', "from" : 'FooBar')"#)?
+        );
+
+        // Trim specification in all 3 spots
+        assert_eq!(
+            preprocess(r#"trim(BOTH TrAiLiNg from TRAILING)"#)?,
+            lex(r#"trim("BOTH" : TrAiLiNg, "from" : TRAILING)"#)?
+        );
+
+        // Trim specification in 1st and 2nd spot
+        assert_eq!(
+            preprocess(r#"trim(LEADING LEADING from 'FooBar')"#)?,
+            lex(r#"trim("LEADING" : LEADING, "from" : 'FooBar')"#)?
+        );
+        assert_eq!(
+            preprocess(r#"trim(LEADING TrAiLiNg from 'FooBar')"#)?,
+            lex(r#"trim("LEADING" : TrAiLiNg, "from" : 'FooBar')"#)?
+        );
+        assert_eq!(
+            preprocess(r#"trim(tRaIlInG TrAiLiNg from 'FooBar')"#)?,
+            lex(r#"trim("tRaIlInG" : TrAiLiNg, "from" : 'FooBar')"#)?
         );
+
+        // Trim specification in 1st and 3rd spot
         assert_eq!(
             preprocess(r#"trim(LEADING 'Foo' from leaDing)"#)?,
-            lex(r#"trim(LEADING : 'Foo', "from" : leaDing)"#)?
+            lex(r#"trim("LEADING" : 'Foo', "from" : leaDing)"#)?
+        );
+
+        // Trim specification (quoted) in 2nd and 3rd spot
+        assert_eq!(
+            preprocess(r#"trim('LEADING' from leaDing)"#)?,
+            lex(r#"trim('LEADING', "from" : leaDing)"#)?
         );
+
+        // Trim specification in 3rd spot only
+        assert_eq!(
+            preprocess(r#"trim('a' from leaDing)"#)?,
+            lex(r#"trim('a', "from" : leaDing)"#)?
+        );
+
+        assert_eq!(
+            preprocess(r#"trim(LEADING a from b)"#)?,
+            lex(r#"trim("LEADING" : a, "from" : b)"#)?
+        );
+
         assert_eq!(
             preprocess(r#"trim(leading from ' Bar')"#)?,
-            lex(r#"trim(leading : ' ', "from" : ' Bar')"#)?
+            lex(r#"trim("leading" : ' ', "from" : ' Bar')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(TrAiLiNg 'Bar' from 'FooBar')"#)?,
-            lex(r#"trim(TrAiLiNg : 'Bar', "from" : 'FooBar')"#)?
+            lex(r#"trim("TrAiLiNg" : 'Bar', "from" : 'FooBar')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(TRAILING from 'Bar ')"#)?,
-            lex(r#"trim(TRAILING: ' ', "from": 'Bar ')"#)?
+            lex(r#"trim("TRAILING": ' ', "from": 'Bar ')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(BOTH 'Foo' from 'FooBarBar')"#)?,
-            lex(r#"trim(BOTH: 'Foo', "from": 'FooBarBar')"#)?
+            lex(r#"trim("BOTH": 'Foo', "from": 'FooBarBar')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(botH from ' Bar ')"#)?,
-            lex(r#"trim(botH: ' ', "from": ' Bar ')"#)?
+            lex(r#"trim("botH": ' ', "from": ' Bar ')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(from ' Bar ')"#)?,