From 489c9f7606ffe2c8e4aac2885b79e4937d3332e1 Mon Sep 17 00:00:00 2001
From: John Ed Quinn
Date: Thu, 17 Nov 2022 16:50:01 -0800
Subject: [PATCH] Adds non-reserved keywords

---
 partiql-parser/src/lexer.rs              |  72 ++++++++-
 partiql-parser/src/parse/mod.rs          |  49 ++++++
 partiql-parser/src/parse/partiql.lalrpop |  30 ++++
 partiql-parser/src/preprocessor.rs       | 184 ++++++++++++++++++++---
 4 files changed, 312 insertions(+), 23 deletions(-)

diff --git a/partiql-parser/src/lexer.rs b/partiql-parser/src/lexer.rs
index 8785f755..849ad5cc 100644
--- a/partiql-parser/src/lexer.rs
+++ b/partiql-parser/src/lexer.rs
@@ -510,6 +510,8 @@ pub enum Token<'input> {
     Ion(&'input str),
     // Keywords
+    #[regex("(?i:Acyclic)")]
+    Acyclic,
     #[regex("(?i:All)")]
     All,
     #[regex("(?i:Asc)")]
     Asc,
@@ -522,6 +524,8 @@ At,
     #[regex("(?i:Between)")]
     Between,
+    #[regex("(?i:Both)")]
+    Both,
     #[regex("(?i:By)")]
     By,
     #[regex("(?i:Case)")]
     Case,
@@ -534,6 +538,8 @@ Desc,
     #[regex("(?i:Distinct)")]
     Distinct,
+    #[regex("(?i:Domain)")]
+    Domain,
     #[regex("(?i:Else)")]
     Else,
     #[regex("(?i:End)")]
     End,
@@ -570,6 +576,8 @@ Last,
     #[regex("(?i:Lateral)")]
     Lateral,
+    #[regex("(?i:Leading)")]
+    Leading,
     #[regex("(?i:Left)")]
     Left,
     #[regex("(?i:Like)")]
     Like,
@@ -602,10 +610,14 @@ Pivot,
     #[regex("(?i:Preserve)")]
     Preserve,
+    #[regex("(?i:Public)")]
+    Public,
     #[regex("(?i:Right)")]
     Right,
     #[regex("(?i:Select)")]
     Select,
+    #[regex("(?i:Simple)")]
+    Simple,
     #[regex("(?i:Table)")]
     Table,
     #[regex("(?i:Time)")]
     Time,
@@ -614,12 +626,18 @@ Timestamp,
     #[regex("(?i:Then)")]
     Then,
+    #[regex("(?i:Trail)")]
+    Trail,
+    #[regex("(?i:Trailing)")]
+    Trailing,
     #[regex("(?i:True)")]
     True,
     #[regex("(?i:Union)")]
     Union,
     #[regex("(?i:Unpivot)")]
     Unpivot,
+    #[regex("(?i:User)")]
+    User,
     #[regex("(?i:Using)")]
     Using,
     #[regex("(?i:Value)")]
     Value,
@@ -642,17 +660,20 @@ impl<'input> Token<'input> {
     pub fn is_keyword(&self) -> bool {
         matches!(
             self,
-            Token::All
+            Token::Acyclic
+                | Token::All
                 | Token::Asc
                 | Token::And
                 | Token::As
                 | Token::At
                 | Token::Between
+                | Token::Both
                 | Token::By
                 | Token::Cross
                 | Token::Date
                 | Token::Desc
                 | Token::Distinct
+                | Token::Domain
                 | Token::Escape
                 | Token::Except
                 | Token::First
@@ -668,6 +689,7 @@
                 | Token::Join
                 | Token::Last
                 | Token::Lateral
+                | Token::Leading
                 | Token::Left
                 | Token::Like
                 | Token::Limit
@@ -684,14 +706,19 @@
                 | Token::Partial
                 | Token::Pivot
                 | Token::Preserve
+                | Token::Public
                 | Token::Right
                 | Token::Select
+                | Token::Simple
                 | Token::Table
                 | Token::Time
                 | Token::Timestamp
                 | Token::Then
+                | Token::Trail
+                | Token::Trailing
                 | Token::Union
                 | Token::Unpivot
+                | Token::User
                 | Token::Using
                 | Token::Value
                 | Token::Values
@@ -748,18 +775,21 @@ impl<'input> fmt::Display for Token<'input> {
             Token::EmbeddedIonQuote => write!(f, ""),
             Token::Ion(txt) => write!(f, "<{}:ION>", txt),

-            Token::All
+            Token::Acyclic
+            | Token::All
             | Token::Asc
             | Token::And
             | Token::As
             | Token::At
             | Token::Between
+            | Token::Both
             | Token::By
             | Token::Case
             | Token::Cross
             | Token::Date
             | Token::Desc
             | Token::Distinct
+            | Token::Domain
             | Token::Else
             | Token::End
             | Token::Escape
@@ -778,6 +808,7 @@
             | Token::Join
             | Token::Last
             | Token::Lateral
+            | Token::Leading
             | Token::Left
             | Token::Like
             | Token::Limit
@@ -794,15 +825,20 @@
             | Token::Partial
             | Token::Pivot
             | Token::Preserve
+            | Token::Public
             | Token::Right
             | Token::Select
+            | Token::Simple
             | Token::Table
             | Token::Time
             | Token::Timestamp
             | Token::Then
+            | Token::Trail
+            | Token::Trailing
             | Token::True
             | Token::Union
             | Token::Unpivot
+            | Token::User
             | Token::Using
             | Token::Value
             | Token::Values
@@ -1107,6 +1143,38 @@ mod tests {
         Ok(())
     }

+    #[test]
+    fn select_non_reserved_keywords() -> Result<(), ParseError<'static, BytePosition>> {
+        let query = "SELECT acyclic, BoTh, DOMAIN, SImple, Trail, TRailing, USER\nfrom @\"foo\"";
+        let mut offset_tracker = LineOffsetTracker::default();
+        let lexer = PartiqlLexer::new(query, &mut offset_tracker);
+        let toks: Vec<_> = lexer.collect::<Result<Vec<_>, _>>()?;
+
+        assert_eq!(
+            vec![
+                Token::Select,
+                Token::Acyclic,
+                Token::Comma,
+                Token::Both,
+                Token::Comma,
+                Token::Domain,
+                Token::Comma,
+                Token::Simple,
+                Token::Comma,
+                Token::Trail,
+                Token::Comma,
+                Token::Trailing,
+                Token::Comma,
+                Token::User,
+                Token::From,
+                Token::QuotedAtIdentifier("foo"),
+            ],
+            toks.into_iter().map(|(_s, t, _e)| t).collect::<Vec<_>>()
+        );
+        assert_eq!(offset_tracker.num_lines(), 2);
+        Ok(())
+    }
+
     #[test]
     fn select_comment_block() -> Result<(), ParseError<'static, BytePosition>> {
         let query = "SELECT /*comment*/ g";
diff --git a/partiql-parser/src/parse/mod.rs b/partiql-parser/src/parse/mod.rs
index d34f4ddf..001de961 100644
--- a/partiql-parser/src/parse/mod.rs
+++ b/partiql-parser/src/parse/mod.rs
@@ -679,6 +679,55 @@ mod tests {
         }
     }

+    // PROOF OF CONCEPT: NON-RESERVED KEYWORDS
+    mod non_reserved {
+        use super::*;
+
+        #[test]
+        fn projection_list_trim_spec() {
+            parse!(r#"SELECT leading FROM t"#);
+            parse!(r#"SELECT leading, a FROM t"#);
+            parse!(r#"SELECT leading + trailing, b FROM t"#);
+            parse!(r#"SELECT both + leading + trailing, a, b, c FROM t"#);
+        }
+
+        #[test]
+        fn from_source() {
+            parse!(r#"SELECT leading, trailing, both FROM leading, trailing, both"#);
+        }
+
+        #[test]
+        fn with_trim() {
+            parse!(
+                r#"SELECT leading + trim(leading leading FROM ' hello world'), both FROM leading, trailing, both"#
+            );
+        }
+
+        #[test]
+        fn with_order() {
+            parse!(r#"SELECT order FROM t ORDER BY order + 5"#);
+            parse!(r#"SELECT order FROM order ORDER BY order + 5"#);
+            parse!(r#"SELECT ORDER FROM ORDER ORDER BY ORDER + 5"#);
+        }
+
+        #[test]
+        fn with_gpml() {
+            parse!(r#"SELECT acyclic, trail, simple FROM t"#);
+            parse!(r#"AcYcLiC"#);
+            parse!(r#"TrAiL"#);
+            parse!(r#"SiMpLe"#);
+        }
+
+        //
+        #[test]
+        fn external_customer_request() {
+            parse!(r#"SELECT user, puBlIC, DOMAIN FROM USER, pUbLIc, domain"#);
+            parse!(r#"USER"#);
+            parse!(r#"pUbLIC"#);
+            parse!(r#"domain"#);
+        }
+    }
+
     mod errors {
         use super::*;
         use crate::error::{LexError, UnexpectedToken, UnexpectedTokenData};
diff --git a/partiql-parser/src/parse/partiql.lalrpop b/partiql-parser/src/parse/partiql.lalrpop
index e31489b1..e983ff68 100644
--- a/partiql-parser/src/parse/partiql.lalrpop
+++ b/partiql-parser/src/parse/partiql.lalrpop
@@ -1025,6 +1025,27 @@ VarRefExpr: ast::Expr = {
         name: ast::SymbolPrimitive { value: ident.to_owned(), case: ast::CaseSensitivity::CaseSensitive },
         qualifier: ast::ScopeQualifier::Unqualified
     },lo..hi)),
+    <lo:@L> <ident:NonReservedKeyword> <hi:@R> => ast::Expr::VarRef(state.node(ast::VarRef {
+        name: ast::SymbolPrimitive { value: ident.to_owned(), case: ast::CaseSensitivity::CaseInsensitive },
+        qualifier: ast::ScopeQualifier::Unqualified
+    },lo..hi)),
 }

+// PROOF OF CONCEPT
+// These are all the proposed non-reserved keywords. ORDER is not itself part of
+// the proposal; it is in this list only to prove that even ORDER can be parsed
+// as a non-reserved keyword.
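+//
+// A worked example (exercised by the with_order test in parse/mod.rs above):
+// in `SELECT order FROM order ORDER BY order + 5`, the three value-position
+// `order`s reduce through NonReservedKeyword to case-insensitive variable
+// references, while `ORDER BY` is still matched as the clause keyword.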
+#[inline]
+NonReservedKeyword: &'static str = {
+    "ACYCLIC" => "ACYCLIC",
+    "BOTH" => "BOTH",
+    "DOMAIN" => "DOMAIN",
+    "LEADING" => "LEADING",
+    "ORDER" => "ORDER",
+    "PUBLIC" => "PUBLIC",
+    "SIMPLE" => "SIMPLE",
+    "TRAIL" => "TRAIL",
+    "TRAILING" => "TRAILING",
+    "USER" => "USER",
+}

 // ------------------------------------------------------------------------------ //
@@ -1267,18 +1288,21 @@ extern {
         "Ion" => lexer::Token::Ion(<&'input str>),

         // Keywords
+        "ACYCLIC" => lexer::Token::Acyclic,
         "ALL" => lexer::Token::All,
         "ASC" => lexer::Token::Asc,
         "AND" => lexer::Token::And,
         "AS" => lexer::Token::As,
         "AT" => lexer::Token::At,
         "BETWEEN" => lexer::Token::Between,
+        "BOTH" => lexer::Token::Both,
         "BY" => lexer::Token::By,
         "CASE" => lexer::Token::Case,
         "CROSS" => lexer::Token::Cross,
         "DATE" => lexer::Token::Date,
         "DESC" => lexer::Token::Desc,
         "DISTINCT" => lexer::Token::Distinct,
+        "DOMAIN" => lexer::Token::Domain,
         "ELSE" => lexer::Token::Else,
         "END" => lexer::Token::End,
         "ESCAPE" => lexer::Token::Escape,
@@ -1297,6 +1321,7 @@ extern {
         "JOIN" => lexer::Token::Join,
         "LAST" => lexer::Token::Last,
         "LATERAL" => lexer::Token::Lateral,
+        "LEADING" => lexer::Token::Leading,
         "LEFT" => lexer::Token::Left,
         "LIKE" => lexer::Token::Like,
         "LIMIT" => lexer::Token::Limit,
@@ -1313,15 +1338,20 @@ extern {
         "PARTIAL" => lexer::Token::Partial,
         "PIVOT" => lexer::Token::Pivot,
         "PRESERVE" => lexer::Token::Preserve,
+        "PUBLIC" => lexer::Token::Public,
         "RIGHT" => lexer::Token::Right,
         "SELECT" => lexer::Token::Select,
+        "SIMPLE" => lexer::Token::Simple,
         "TABLE" => lexer::Token::Table,
         "TIME" => lexer::Token::Time,
         "TIMESTAMP" => lexer::Token::Timestamp,
         "THEN" => lexer::Token::Then,
+        "TRAIL" => lexer::Token::Trail,
+        "TRAILING" => lexer::Token::Trailing,
         "TRUE" => lexer::Token::True,
         "UNION" => lexer::Token::Union,
         "UNPIVOT" => lexer::Token::Unpivot,
+        "USER" => lexer::Token::User,
         "USING" => lexer::Token::Using,
         "VALUE" => lexer::Token::Value,
         "VALUES" => lexer::Token::Values,
diff --git a/partiql-parser/src/preprocessor.rs b/partiql-parser/src/preprocessor.rs
index 9e31ba3a..8e4791dd 100644
--- a/partiql-parser/src/preprocessor.rs
+++ b/partiql-parser/src/preprocessor.rs
@@ -60,26 +60,128 @@ mod built_ins {
     use regex::Regex;

     use FnExprArgMatch::{
-        AnyOne, AnyZeroOrMore as AnyStar, NamedArgId as Id, NamedArgKw as Kw, Synthesize as Syn,
+        AnyOne, AnyZeroOrMore as AnyStar, Match, NamedArgId as Id, NamedArgKw as Kw,
+        Synthesize as Syn,
     };

-    const TRIM_SPECIFIER: &str = "(?i:leading)|(?i:trailing)|(?i:both)";
+    /// PROOF OF CONCEPT EXAMPLE
+    /// Creates the combinations of regular expressions to parse TRIM.
+    /// Allows for using LEADING/TRAILING/BOTH as both identifiers and keywords.
+    /// Note: This isn't meant to be efficient.
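+    ///
+    /// A worked example (mirrored by the preprocessor tests below): in
+    /// `trim(BOTH TrAiLiNg from TRAILING)` a trim specifier appears in all
+    /// three argument positions. The patterns generated here let the
+    /// preprocessor rewrite it to `trim("BOTH" : TrAiLiNg, "from" : TRAILING)`,
+    /// keeping the first token as the named argument and treating the other
+    /// two as plain identifiers.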
+    fn create_trim_regular_expression_combinations<'a>() -> Vec<Vec<FnExprArgMatch<'a>>> {
+        let outer_tokens = [Token::Leading, Token::Trailing, Token::Both];
+        let mid_tokens = [Token::Leading, Token::Trailing, Token::Both];
+        let inner_tokens = [Token::Leading, Token::Trailing, Token::Both];
+        let mut regular_expressions: Vec<Vec<FnExprArgMatch<'a>>> = Vec::new();
+
+        // 1st, 2nd, and 3rd: trim(trailing both from leading) => trim("trailing": both, "from": leading)
+        for outer_token in outer_tokens.iter() {
+            for mid_token in mid_tokens.iter() {
+                for last_token in inner_tokens.iter() {
+                    let expression = vec![
+                        Kw(outer_token.clone()),
+                        Match(mid_token.clone()),
+                        AnyStar(false),
+                        Kw(Token::From),
+                        Match(last_token.clone()),
+                        AnyStar(false),
+                    ];
+                    regular_expressions.push(expression);
+                }
+            }
+        }
+
+        // 1st and 3rd: trim(trailing 'hello' from leading) => trim("trailing": 'hello', "from": leading)
+        // 1st and 2nd: trim(trailing leading from x) => trim("trailing": leading, "from": x)
+        for outer_token in outer_tokens.iter() {
+            for mid_token in mid_tokens.iter() {
+                let expression = vec![
+                    Kw(outer_token.clone()),
+                    AnyOne(false),
+                    AnyStar(false),
+                    Kw(Token::From),
+                    Match(mid_token.clone()),
+                    AnyStar(false),
+                ];
+                let expression1 = vec![
+                    Kw(outer_token.clone()),
+                    Match(mid_token.clone()),
+                    AnyStar(false),
+                    Kw(Token::From),
+                    AnyOne(false),
+                    AnyStar(false),
+                ];
+                regular_expressions.push(expression);
+                regular_expressions.push(expression1);
+            }
+        }
+
+        // 1st only: trim(trailing 'hello' from x) => trim("trailing": 'hello', "from": x)
+        // 3rd only: trim('hello' from leading) => trim('hello', "from": leading)
+        for outer_token in outer_tokens {
+            let expression = vec![
+                Kw(outer_token.clone()),
+                AnyOne(false),
+                AnyStar(false),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ];
+
+            let expression_01 = vec![
+                AnyOne(false),
+                AnyStar(false),
+                Kw(Token::From),
+                Match(outer_token.clone()),
+                AnyStar(false),
+            ];
+            regular_expressions.push(expression);
+            regular_expressions.push(expression_01);
+        }
+        regular_expressions
+    }

     pub(crate) fn built_in_trim() -> FnExpr<'static> {
-        let re = Regex::new(TRIM_SPECIFIER).unwrap();
+        let mut permutations = create_trim_regular_expression_combinations();
+        let mut patterns = vec![
+            // e.g., trim(trailing from x) => trim("trailing": ' ', "from": x)
+            vec![
+                Kw(Token::Leading),
+                Syn(Token::String(" ")),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ],
+            vec![
+                Kw(Token::Trailing),
+                Syn(Token::String(" ")),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ],
+            vec![
+                Kw(Token::Both),
+                Syn(Token::String(" ")),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ],
+            // e.g., trim(' ' from x) => trim(' ', "from": x)
+            vec![
+                AnyOne(false),
+                AnyStar(false),
+                Kw(Token::From),
+                AnyOne(false),
+                AnyStar(false),
+            ],
+            // e.g., trim(from x) => trim("from": x)
+            vec![Kw(Token::From), AnyOne(false), AnyStar(false)],
+        ];
+        permutations.append(&mut patterns);
         FnExpr {
             fn_names: vec!["trim"],
             #[rustfmt::skip]
-            patterns: vec![
-                // e.g., trim(leading 'tt' from x) => trim("leading": 'tt', "from": x)
-                vec![Id(re.clone()), AnyOne(false), AnyStar(false), Kw(Token::From), AnyOne(false), AnyStar(false)],
-                // e.g., trim(trailing from x) => trim("trailing": ' ', "from": x)
-                vec![Id(re), Syn(Token::String(" ")), Kw(Token::From), AnyOne(false), AnyStar(false)],
-                // e.g., trim(' ' from x) => trim(' ', "from": x)
-                vec![AnyOne(false), AnyStar(false), Kw(Token::From), AnyOne(false), AnyStar(false)],
-                // e.g., trim(from x) => trim("from": x)
-                vec![Kw(Token::From), AnyOne(false), AnyStar(false)],
-            ],
+            patterns: permutations,
         }
     }
@@ -640,7 +742,7 @@ mod tests {
             vec![
                 Token::UnquotedIdent("trim"),
                 Token::OpenParen,
-                Token::UnquotedIdent("LEADING"),
+                Token::QuotedIdent("LEADING"),
                 Token::Colon,
                 Token::String("Foo"),
                 Token::Comma,
@@ -704,31 +806,71 @@
         assert_eq!(
             preprocess(r#"trim(LEADING 'Foo' from 'FooBar')"#)?,
-            lex(r#"trim(LEADING : 'Foo', "from" : 'FooBar')"#)?
+            lex(r#"trim("LEADING" : 'Foo', "from" : 'FooBar')"#)?
+        );
+
+        // Trim specification in all 3 spots
+        assert_eq!(
+            preprocess(r#"trim(BOTH TrAiLiNg from TRAILING)"#)?,
+            lex(r#"trim("BOTH" : TrAiLiNg, "from" : TRAILING)"#)?
+        );
+
+        // Trim specification in 1st and 2nd spot
+        assert_eq!(
+            preprocess(r#"trim(LEADING LEADING from 'FooBar')"#)?,
+            lex(r#"trim("LEADING" : LEADING, "from" : 'FooBar')"#)?
+        );
+        assert_eq!(
+            preprocess(r#"trim(LEADING TrAiLiNg from 'FooBar')"#)?,
+            lex(r#"trim("LEADING" : TrAiLiNg, "from" : 'FooBar')"#)?
+        );
+        assert_eq!(
+            preprocess(r#"trim(tRaIlInG TrAiLiNg from 'FooBar')"#)?,
+            lex(r#"trim("tRaIlInG" : TrAiLiNg, "from" : 'FooBar')"#)?
         );
+
+        // Trim specification in 1st and 3rd spot
         assert_eq!(
             preprocess(r#"trim(LEADING 'Foo' from leaDing)"#)?,
-            lex(r#"trim(LEADING : 'Foo', "from" : leaDing)"#)?
+            lex(r#"trim("LEADING" : 'Foo', "from" : leaDing)"#)?
+        );
+
+        // Trim specification (quoted) in 2nd and 3rd spot
+        assert_eq!(
+            preprocess(r#"trim('LEADING' from leaDing)"#)?,
+            lex(r#"trim('LEADING', "from" : leaDing)"#)?
         );
+
+        // Trim specification in 3rd spot only
+        assert_eq!(
+            preprocess(r#"trim('a' from leaDing)"#)?,
+            lex(r#"trim('a', "from" : leaDing)"#)?
+        );
+
+        assert_eq!(
+            preprocess(r#"trim(LEADING a from b)"#)?,
+            lex(r#"trim("LEADING" : a, "from" : b)"#)?
+        );
+
         assert_eq!(
             preprocess(r#"trim(leading from ' Bar')"#)?,
-            lex(r#"trim(leading : ' ', "from" : ' Bar')"#)?
+            lex(r#"trim("leading" : ' ', "from" : ' Bar')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(TrAiLiNg 'Bar' from 'FooBar')"#)?,
-            lex(r#"trim(TrAiLiNg : 'Bar', "from" : 'FooBar')"#)?
+            lex(r#"trim("TrAiLiNg" : 'Bar', "from" : 'FooBar')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(TRAILING from 'Bar ')"#)?,
-            lex(r#"trim(TRAILING: ' ', "from": 'Bar ')"#)?
+            lex(r#"trim("TRAILING": ' ', "from": 'Bar ')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(BOTH 'Foo' from 'FooBarBar')"#)?,
-            lex(r#"trim(BOTH: 'Foo', "from": 'FooBarBar')"#)?
+            lex(r#"trim("BOTH": 'Foo', "from": 'FooBarBar')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(botH from ' Bar ')"#)?,
-            lex(r#"trim(botH: ' ', "from": ' Bar ')"#)?
+            lex(r#"trim("botH": ' ', "from": ' Bar ')"#)?
         );
         assert_eq!(
             preprocess(r#"trim(from ' Bar ')"#)?,