From 8c0a1a4fa68c7b484e81bc8709fb714de75b6b98 Mon Sep 17 00:00:00 2001 From: jinser Date: Thu, 21 Dec 2023 14:46:21 +0800 Subject: [PATCH] feat(expr): implement ` [ NOT ] SIMILAR TO [ ESCAPE ]` (#14000) Co-authored-by: Runji Wang --- .../functions/similar_to_escape.slt.part | 98 ++++++++ proto/expr.proto | 1 + src/expr/impl/src/scalar/mod.rs | 1 + src/expr/impl/src/scalar/similar_to_escape.rs | 224 ++++++++++++++++++ src/frontend/src/binder/expr/mod.rs | 41 ++++ src/frontend/src/expr/pure.rs | 1 + src/meta/src/controller/rename.rs | 12 + src/meta/src/manager/catalog/utils.rs | 12 + src/sqlparser/src/ast/mod.rs | 25 ++ src/sqlparser/src/parser.rs | 23 +- src/tests/regress/data/sql/strings.sql | 22 +- 11 files changed, 447 insertions(+), 13 deletions(-) create mode 100644 e2e_test/batch/functions/similar_to_escape.slt.part create mode 100644 src/expr/impl/src/scalar/similar_to_escape.rs diff --git a/e2e_test/batch/functions/similar_to_escape.slt.part b/e2e_test/batch/functions/similar_to_escape.slt.part new file mode 100644 index 000000000000..eec8ae2d292a --- /dev/null +++ b/e2e_test/batch/functions/similar_to_escape.slt.part @@ -0,0 +1,98 @@ +statement ok +create table t (id int, pat varchar, text varchar); + +# example data from https://www.postgresql.org/docs/16/functions-matching.html#FUNCTIONS-SIMILARTO-REGEXP +statement ok +insert into + t +values + (1, 'abc', 'abc'), + (2, 'a', 'abc'), + (3, '%(b|d)%', 'abc'), + (4, '(b|c)%', 'abc'), + (5, '%abc-%', '-abc-'), + (6, '%abc-%', 'xabcy'); + +query B +select (text similar to pat) from t order by id; +---- +t +f +t +f +t +f + +query B +select (text not similar to pat) from t order by id; +---- +f +t +f +t +f +t + +query T +select text from t where text similar to pat order by id; +---- +abc +abc +-abc- + +query T +select text from t where text not similar to pat order by id; +---- +abc +abc +xabcy + +query I +select count(1) from t where text similar to 'ab%'; +---- +4 + +query I +select count(1) from t where text 
not similar to 'ab%'; +---- +2 + +query B +select 'foobar' similar to '%#"o_b#"%'; +---- +f + +# default escape string +query B +select 'foobar' similar to '%#"o_b#"%' escape '\'; +---- +f + +# fallback to default escape string +query B +select 'foobar' similar to '%#"o_b#"%' escape ''; +---- +f + +query B +select 'foobar' similar to '%#"o_b#"%' escape '#'; +---- +t + +query B +select 'foobar' not similar to '%#"o_b#"%' escape '#'; +---- +f + +query B +select 'foobar' similar to '%🤡"o_b🤡"%' escape '🤡'; +---- +t + +query B +select 'foobar' not similar to '%👪"o_b👪"%' escape '👪'; +---- +f + +statement ok +drop table t; diff --git a/proto/expr.proto b/proto/expr.proto index 653cbeaa339a..c2285c411c89 100644 --- a/proto/expr.proto +++ b/proto/expr.proto @@ -73,6 +73,7 @@ message ExprNode { LENGTH = 203; LIKE = 204; I_LIKE = 279; + SIMILAR_TO_ESCAPE = 284; UPPER = 205; LOWER = 206; TRIM = 207; diff --git a/src/expr/impl/src/scalar/mod.rs b/src/expr/impl/src/scalar/mod.rs index 1d1f3e4bc150..dd1da7d88758 100644 --- a/src/expr/impl/src/scalar/mod.rs +++ b/src/expr/impl/src/scalar/mod.rs @@ -68,6 +68,7 @@ mod repeat; mod replace; mod round; mod sha; +mod similar_to_escape; mod split_part; mod string; mod string_to_array; diff --git a/src/expr/impl/src/scalar/similar_to_escape.rs b/src/expr/impl/src/scalar/similar_to_escape.rs new file mode 100644 index 000000000000..fc2a13a5eebd --- /dev/null +++ b/src/expr/impl/src/scalar/similar_to_escape.rs @@ -0,0 +1,224 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Write; + +use risingwave_expr::{function, ExprError, Result}; + +// escape `similar-to` pattern to POSIX regex pattern +// Adapted from: +// https://github.com/postgres/postgres/blob/REL_16_STABLE/src/backend/utils/adt/regexp.c#L768 +fn similar_escape_internal( + pat: &str, + esc_text: Option, + writer: &mut impl Write, +) -> std::result::Result<(), ExprError> { + macro_rules! write_ { + ($s:expr) => { + write!(writer, "{}", $s).unwrap() + }; + } + + write_!("^(?:"); + + let mut nquotes = 0; + let mut afterescape = false; + let mut incharclass = false; + + for chr in pat.chars() { + match chr { + c if afterescape => { + if c == '"' && !incharclass { + match nquotes { + 0 => write_!("){1,1}?("), + 1 => write_!("){1,1}(?:"), + _ => { + return Err(ExprError::InvalidParam { + name: "pat", + reason: "SQL regular expression may not contain more than two escape-double-quote separators".into() + }); + } + } + nquotes += 1; + } else { + write_!('\\'); + write_!(c); + } + + afterescape = false; + } + c if esc_text.is_some() && c == esc_text.unwrap() => { + afterescape = true; + } + c if incharclass => { + if c == '\\' { + write_!('\\'); + } + write_!(c); + + if c == ']' { + incharclass = false; + } + } + c @ '[' => { + write_!(c); + incharclass = true; + } + '%' => { + write_!(".*"); + } + '_' => { + write_!('.'); + } + '(' => { + // convert to non-capturing parenthesis + write_!("(?:"); + } + c @ ('\\' | '.' 
| '^' | '$') => { + write_!('\\'); + write_!(c); + } + c => { + write_!(c); + } + } + } + + write_!(")$"); + + Ok(()) +} + +#[function( + // x SIMILAR TO y -> x ~ similar_to_escape(y) + "similar_to_escape(varchar) -> varchar", +)] +fn similar_to_escape_default(pat: &str, writer: &mut impl Write) -> Result<()> { + similar_escape_internal(pat, Some('\\'), writer) +} + +#[function( + // x SIMILAR TO y ESCAPE z -> x ~ similar_to_escape(y, z) + "similar_to_escape(varchar, varchar) -> varchar" +)] +fn similar_to_escape_with_escape_text( + pat: &str, + esc_text: &str, + writer: &mut impl Write, +) -> Result<()> { + if esc_text.chars().nth(1).is_some() { + return Err(ExprError::InvalidParam { + name: "escape string", + reason: format!( + "Invalid escape string: `{}`, must be empty or one character", + esc_text + ) + .into(), + }); + } + + similar_escape_internal(pat, esc_text.chars().next(), writer) +} + +#[cfg(test)] +mod tests { + use super::{similar_to_escape_default, similar_to_escape_with_escape_text}; + + #[test] + fn test_default_escape() { + let cases = vec![ + ("", "^(?:)$"), + ("_bcd%", r#"^(?:.bcd.*)$"#), + ("bcd%", r#"^(?:bcd.*)$"#), + (r#"_bcd\%"#, r#"^(?:.bcd\%)$"#), + ("bcd[]ee", "^(?:bcd[]ee)$"), + (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#), + ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"), + ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#), + ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#), + (r#"%\"o_b\"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"), + ]; + + for (pat, escaped) in cases { + let mut writer = String::new(); + similar_to_escape_default(pat, &mut writer).ok(); + assert_eq!(writer, escaped); + } + + // may not contain more than two escape-double-quote separators + // 3 double quotes (> 2) + let pat = r#"one\"two\"three\"four"#; + let mut writer = String::new(); + let res = similar_to_escape_default(pat, &mut writer); + assert!(res.is_err()); + } + + #[test] + fn test_escape_with_escape_text() { + let cases = vec![ + ("", "^(?:)$"), + ("_bcd%", 
"^(?:.bcd.*)$"), + ("bcd%", "^(?:bcd.*)$"), + (r#"_bcd\%"#, r#"^(?:.bcd\\.*)$"#), + ("bcd[]ee", "^(?:bcd[]ee)$"), + (r#"bcd[]ee"""#, r#"^(?:bcd[]ee"")$"#), + (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#), + ("bcd[pp]ee", "^(?:bcd[pp]ee)$"), + ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"), + ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#), + ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#), + (r#"%#"o_b#"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"), + ]; + + for (pat, escaped) in cases { + let mut writer = String::new(); + similar_to_escape_with_escape_text(pat, "#", &mut writer).ok(); + assert_eq!(writer, escaped); + } + + let pat = "xxx"; + let mut writer = String::new(); + let res = similar_to_escape_with_escape_text(pat, "##", &mut writer); + assert!(res.is_err()) + } + + #[test] + fn test_escape_with_escape_unicode() { + let cases = vec![ + ("", "^(?:)$"), + ("_bcd%", "^(?:.bcd.*)$"), + ("bcd%", "^(?:bcd.*)$"), + (r#"_bcd\%"#, r#"^(?:.bcd\\.*)$"#), + ("bcd[]ee", "^(?:bcd[]ee)$"), + (r#"bcd[]ee"""#, r#"^(?:bcd[]ee"")$"#), + (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#), + ("bcd[pp]ee", "^(?:bcd[pp]ee)$"), + ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"), + ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#), + ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#), + (r#"%💅"o_b💅"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"), + ]; + + for (pat, escaped) in cases { + let mut writer = String::new(); + similar_to_escape_with_escape_text(pat, "💅", &mut writer).ok(); + assert_eq!(writer, escaped); + } + + let pat = "xxx"; + let mut writer = String::new(); + let res = similar_to_escape_with_escape_text(pat, "💅💅", &mut writer); + assert!(res.is_err()) + } +} diff --git a/src/frontend/src/binder/expr/mod.rs b/src/frontend/src/binder/expr/mod.rs index 2dfe1abbb26b..7ec7bcccd694 100644 --- a/src/frontend/src/binder/expr/mod.rs +++ b/src/frontend/src/binder/expr/mod.rs @@ -139,6 +139,12 @@ impl Binder { low, high, } => self.bind_between(*expr, negated, *low, *high), + Expr::SimilarTo { + expr, + negated, + 
pat, + esc_text, + } => self.bind_similar_to(*expr, negated, *pat, esc_text), Expr::InList { expr, list, @@ -413,6 +419,41 @@ impl Binder { Ok(func_call.into()) } + /// Bind ` [ NOT ] SIMILAR TO ESCAPE ` + pub(super) fn bind_similar_to( + &mut self, + expr: Expr, + negated: bool, + pat: Expr, + esc_text: Option>, + ) -> Result { + let expr = self.bind_expr_inner(expr)?; + let pat = self.bind_expr_inner(pat)?; + + let esc_inputs = if let Some(et) = esc_text { + let esc_text = self.bind_expr_inner(*et)?; + vec![pat, esc_text] + } else { + vec![pat] + }; + + let esc_call = + FunctionCall::new_unchecked(ExprType::SimilarToEscape, esc_inputs, DataType::Varchar); + + let regex_call = FunctionCall::new_unchecked( + ExprType::RegexpEq, + vec![expr, esc_call.into()], + DataType::Boolean, + ); + let func_call = if negated { + FunctionCall::new_unchecked(ExprType::Not, vec![regex_call.into()], DataType::Boolean) + } else { + regex_call + }; + + Ok(func_call.into()) + } + pub(super) fn bind_case( &mut self, operand: Option>, diff --git a/src/frontend/src/expr/pure.rs b/src/frontend/src/expr/pure.rs index 79ece8873160..0b1819a262f2 100644 --- a/src/frontend/src/expr/pure.rs +++ b/src/frontend/src/expr/pure.rs @@ -73,6 +73,7 @@ impl ExprVisitor for ImpureAnalyzer { | expr_node::Type::Length | expr_node::Type::Like | expr_node::Type::ILike + | expr_node::Type::SimilarToEscape | expr_node::Type::Upper | expr_node::Type::Lower | expr_node::Type::Trim diff --git a/src/meta/src/controller/rename.rs b/src/meta/src/controller/rename.rs index 254565efb391..91af881fbd5d 100644 --- a/src/meta/src/controller/rename.rs +++ b/src/meta/src/controller/rename.rs @@ -280,6 +280,18 @@ impl QueryRewriter<'_> { self.visit_expr(low); self.visit_expr(high); } + Expr::SimilarTo { + expr, + pat, + esc_text, + .. 
+ } => { + self.visit_expr(expr); + self.visit_expr(pat); + if let Some(e) = esc_text { + self.visit_expr(e); + } + } Expr::IsDistinctFrom(expr1, expr2) | Expr::IsNotDistinctFrom(expr1, expr2) diff --git a/src/meta/src/manager/catalog/utils.rs b/src/meta/src/manager/catalog/utils.rs index ea579867fc32..1984d55c0f35 100644 --- a/src/meta/src/manager/catalog/utils.rs +++ b/src/meta/src/manager/catalog/utils.rs @@ -308,6 +308,18 @@ impl QueryRewriter<'_> { self.visit_expr(low); self.visit_expr(high); } + Expr::SimilarTo { + expr, + pat, + esc_text, + .. + } => { + self.visit_expr(expr); + self.visit_expr(pat); + if let Some(e) = esc_text { + self.visit_expr(e); + } + } Expr::IsDistinctFrom(expr1, expr2) | Expr::IsNotDistinctFrom(expr1, expr2) diff --git a/src/sqlparser/src/ast/mod.rs b/src/sqlparser/src/ast/mod.rs index d6063b8d4872..a6fa41bc6c0c 100644 --- a/src/sqlparser/src/ast/mod.rs +++ b/src/sqlparser/src/ast/mod.rs @@ -327,6 +327,13 @@ pub enum Expr { low: Box, high: Box, }, + /// ` [ NOT ] SIMILAR TO ESCAPE ` + SimilarTo { + expr: Box, + negated: bool, + pat: Box, + esc_text: Option>, + }, /// Binary operation e.g. 
`1 + 1` or `foo > bar` BinaryOp { left: Box, @@ -526,6 +533,24 @@ impl fmt::Display for Expr { low, high ), + Expr::SimilarTo { + expr, + negated, + pat, + esc_text, + } => { + write!( + f, + "{} {}SIMILAR TO {}", + expr, + if *negated { "NOT " } else { "" }, + pat, + )?; + if let Some(et) = esc_text { + write!(f, " ESCAPE {}", et)?; + } + Ok(()) + } Expr::BinaryOp { left, op, right } => write!(f, "{} {} {}", left, op, right), Expr::SomeOp(expr) => write!(f, "SOME({})", expr), Expr::AllOp(expr) => write!(f, "ALL({})", expr), diff --git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs index 5d67bbb4a1a9..147463cefefc 100644 --- a/src/sqlparser/src/parser.rs +++ b/src/sqlparser/src/parser.rs @@ -1521,15 +1521,17 @@ impl Parser { self.expected("Expected Token::Word after AT", tok) } } - Keyword::NOT | Keyword::IN | Keyword::BETWEEN => { + Keyword::NOT | Keyword::IN | Keyword::BETWEEN | Keyword::SIMILAR => { self.prev_token(); let negated = self.parse_keyword(Keyword::NOT); if self.parse_keyword(Keyword::IN) { self.parse_in(expr, negated) } else if self.parse_keyword(Keyword::BETWEEN) { self.parse_between(expr, negated) + } else if self.parse_keywords(&[Keyword::SIMILAR, Keyword::TO]) { + self.parse_similar_to(expr, negated) } else { - self.expected("IN or BETWEEN after NOT", self.peek_token()) + self.expected("IN, BETWEEN or SIMILAR TO after NOT", self.peek_token()) } } // Can only happen if `get_next_precedence` got out of sync with this function @@ -1685,6 +1687,23 @@ impl Parser { }) } + /// Parses `SIMILAR TO [ ESCAPE ]` + pub fn parse_similar_to(&mut self, expr: Expr, negated: bool) -> Result { + let pat = self.parse_subexpr(Precedence::Between)?; + let esc_text = if self.parse_keyword(Keyword::ESCAPE) { + Some(Box::new(self.parse_subexpr(Precedence::Between)?)) + } else { + None + }; + + Ok(Expr::SimilarTo { + expr: Box::new(expr), + negated, + pat: Box::new(pat), + esc_text, + }) + } + /// Parse a postgresql casting style which is in the form of 
`expr::datatype` pub fn parse_pg_cast(&mut self, expr: Expr) -> Result { Ok(Expr::Cast { diff --git a/src/tests/regress/data/sql/strings.sql b/src/tests/regress/data/sql/strings.sql index b4b77ffaa15e..3bbba9fd0829 100644 --- a/src/tests/regress/data/sql/strings.sql +++ b/src/tests/regress/data/sql/strings.sql @@ -173,17 +173,17 @@ SELECT SUBSTRING('string' FROM -10 FOR -2147483646) AS "error"; --@ SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; -- Check behavior of SIMILAR TO, which uses largely the same regexp variant ---@ SELECT 'abcdefg' SIMILAR TO '_bcd%' AS true; ---@ SELECT 'abcdefg' SIMILAR TO 'bcd%' AS false; ---@ SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '#' AS false; ---@ SELECT 'abcd%' SIMILAR TO '_bcd#%' ESCAPE '#' AS true; ---@ -- Postgres uses '\' as the default escape character, which is not per spec ---@ SELECT 'abcdefg' SIMILAR TO '_bcd\%' AS false; ---@ -- and an empty string to mean "no escape", which is also not per spec ---@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true; ---@ -- these behaviors are per spec, though: ---@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; ---@ SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; +SELECT 'abcdefg' SIMILAR TO '_bcd%' AS true; +SELECT 'abcdefg' SIMILAR TO 'bcd%' AS false; +SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '#' AS false; +SELECT 'abcd%' SIMILAR TO '_bcd#%' ESCAPE '#' AS true; +-- Postgres uses '\' as the default escape character, which is not per spec +SELECT 'abcdefg' SIMILAR TO '_bcd\%' AS false; +-- and an empty string to mean "no escape", which is also not per spec +SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true; +-- these behaviors are per spec, though: +SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; +SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; -- Test backslash escapes in regexp_replace's replacement string --@ SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');