Skip to content

Commit

Permalink
feat(expr): implement `<expr> [ NOT ] SIMILAR TO <pat> [ ESCAPE <esc_…
Browse files Browse the repository at this point in the history
…text> ]` (#14000)

Co-authored-by: Runji Wang <[email protected]>
  • Loading branch information
jetjinser and wangrunji0408 authored Dec 21, 2023
1 parent dd7bd7a commit 8c0a1a4
Show file tree
Hide file tree
Showing 11 changed files with 447 additions and 13 deletions.
98 changes: 98 additions & 0 deletions e2e_test/batch/functions/similar_to_escape.slt.part
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
statement ok
create table t (id int, pat varchar, text varchar);

# example data from https://www.postgresql.org/docs/16/functions-matching.html#FUNCTIONS-SIMILARTO-REGEXP
statement ok
insert into
t
values
(1, 'abc', 'abc'),
(2, 'a', 'abc'),
(3, '%(b|d)%', 'abc'),
(4, '(b|c)%', 'abc'),
(5, '%abc-%', '-abc-'),
(6, '%abc-%', 'xabcy');

query B
select (text similar to pat) from t order by id;
----
t
f
t
f
t
f

query B
select (text not similar to pat) from t order by id;
----
f
t
f
t
f
t

query T
select text from t where text similar to pat order by id;
----
abc
abc
-abc-

query T
select text from t where text not similar to pat order by id;
----
abc
abc
xabcy

query I
select count(1) from t where text similar to 'ab%';
----
4

query I
select count(1) from t where text not similar to 'ab%';
----
2

query B
select 'foobar' similar to '%#"o_b#"%';
----
f

# default escape string
query B
select 'foobar' similar to '%#"o_b#"%' escape '\';
----
f

# an empty escape string disables escaping (no escape character is used)
query B
select 'foobar' similar to '%#"o_b#"%' escape '';
----
f

query B
select 'foobar' similar to '%#"o_b#"%' escape '#';
----
t

query B
select 'foobar' not similar to '%#"o_b#"%' escape '#';
----
f

query B
select 'foobar' similar to '%🤡"o_b🤡"%' escape '🤡';
----
t

query B
select 'foobar' not similar to '%👪"o_b👪"%' escape '👪';
----
f

statement ok
drop table t;
1 change: 1 addition & 0 deletions proto/expr.proto
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ message ExprNode {
LENGTH = 203;
LIKE = 204;
I_LIKE = 279;
SIMILAR_TO_ESCAPE = 284;
UPPER = 205;
LOWER = 206;
TRIM = 207;
Expand Down
1 change: 1 addition & 0 deletions src/expr/impl/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ mod repeat;
mod replace;
mod round;
mod sha;
mod similar_to_escape;
mod split_part;
mod string;
mod string_to_array;
Expand Down
224 changes: 224 additions & 0 deletions src/expr/impl/src/scalar/similar_to_escape.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
// Copyright 2023 RisingWave Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::fmt::Write;

use risingwave_expr::{function, ExprError, Result};

/// Converts a SQL `SIMILAR TO` pattern into an anchored regular expression.
///
/// The translated pattern is appended to `writer` in the form `^(?: ... )$`.
/// When `esc_text` is `None`, no character is treated as an escape character.
///
/// Adapted from:
/// <https://github.com/postgres/postgres/blob/REL_16_STABLE/src/backend/utils/adt/regexp.c#L768>
///
/// # Errors
///
/// Returns `ExprError::InvalidParam` when `pat` contains more than two
/// escape-double-quote separators.
///
/// # Panics
///
/// Panics if `writer` reports an error while being written to.
fn similar_escape_internal(
    pat: &str,
    esc_text: Option<char>,
    writer: &mut impl Write,
) -> std::result::Result<(), ExprError> {
    // Number of escape-double-quote separators emitted so far.
    let mut separator_count = 0;
    // Whether the previous character was the escape character.
    let mut escaped = false;
    // Whether the scan is currently inside a `[...]` bracket expression.
    let mut in_bracket = false;

    writer.write_str("^(?:").unwrap();

    for ch in pat.chars() {
        if escaped {
            if ch == '"' && !in_bracket {
                // An escaped double quote outside brackets is a part separator.
                let separator = match separator_count {
                    0 => "){1,1}?(",
                    1 => "){1,1}(?:",
                    _ => {
                        return Err(ExprError::InvalidParam {
                            name: "pat",
                            reason: "SQL regular expression may not contain more than two escape-double-quote separators".into()
                        });
                    }
                };
                writer.write_str(separator).unwrap();
                separator_count += 1;
            } else {
                // Any other escaped character is emitted behind a backslash.
                writer.write_char('\\').unwrap();
                writer.write_char(ch).unwrap();
            }
            escaped = false;
        } else if esc_text == Some(ch) {
            // The escape character itself is never copied to the output.
            escaped = true;
        } else if in_bracket {
            // Inside brackets only the backslash needs protecting.
            if ch == '\\' {
                writer.write_char('\\').unwrap();
            }
            writer.write_char(ch).unwrap();
            // The first `]` closes the bracket expression.
            in_bracket = ch != ']';
        } else {
            match ch {
                '[' => {
                    writer.write_char('[').unwrap();
                    in_bracket = true;
                }
                // SQL wildcards map to their regex equivalents.
                '%' => writer.write_str(".*").unwrap(),
                '_' => writer.write_char('.').unwrap(),
                // convert to non-capturing parenthesis
                '(' => writer.write_str("(?:").unwrap(),
                // Regex metacharacters that are plain characters in SQL patterns.
                '\\' | '.' | '^' | '$' => {
                    writer.write_char('\\').unwrap();
                    writer.write_char(ch).unwrap();
                }
                other => writer.write_char(other).unwrap(),
            }
        }
    }

    writer.write_str(")$").unwrap();

    Ok(())
}

/// Translates a `SIMILAR TO` pattern into a regular expression, using a
/// backslash as the escape character.
///
/// The translated pattern is appended to `writer`.
#[function(
    // x SIMILAR TO y -> x ~ similar_to_escape(y)
    "similar_to_escape(varchar) -> varchar",
)]
fn similar_to_escape_default(pat: &str, writer: &mut impl Write) -> Result<()> {
    // Escape character applied when the query has no `ESCAPE` clause.
    let default_escape = Some('\\');
    similar_escape_internal(pat, default_escape, writer)
}

/// Translates a `SIMILAR TO` pattern into a regular expression, using the
/// escape character supplied by the caller.
///
/// `esc_text` must be empty or exactly one character. An empty string means
/// that no escape character is used. The translated pattern is appended to
/// `writer`.
///
/// # Errors
///
/// Returns `ExprError::InvalidParam` when `esc_text` holds more than one
/// character, or when the pattern translation itself fails.
#[function(
    // x SIMILAR TO y ESCAPE z -> x ~ similar_to_escape(y, z)
    "similar_to_escape(varchar, varchar) -> varchar"
)]
fn similar_to_escape_with_escape_text(
    pat: &str,
    esc_text: &str,
    writer: &mut impl Write,
) -> Result<()> {
    let mut esc_chars = esc_text.chars();
    let escape = esc_chars.next();

    // Zero or one character: hand over to the shared translation routine.
    if esc_chars.next().is_none() {
        return similar_escape_internal(pat, escape, writer);
    }

    Err(ExprError::InvalidParam {
        name: "escape string",
        reason: format!(
            "Invalid escape string: `{}`, must be empty or one character",
            esc_text
        )
        .into(),
    })
}

#[cfg(test)]
mod tests {
    use super::{similar_to_escape_default, similar_to_escape_with_escape_text};

    /// Patterns translated with the default (backslash) escape character.
    #[test]
    fn test_default_escape() {
        let cases = vec![
            ("", "^(?:)$"),
            ("_bcd%", r#"^(?:.bcd.*)$"#),
            ("bcd%", r#"^(?:bcd.*)$"#),
            (r#"_bcd\%"#, r#"^(?:.bcd\%)$"#),
            ("bcd[]ee", "^(?:bcd[]ee)$"),
            (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#),
            ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"),
            ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#),
            ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#),
            (r#"%\"o_b\"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"),
        ];

        for (pat, escaped) in cases {
            let mut writer = String::new();
            let res = similar_to_escape_default(pat, &mut writer);
            // Check the result explicitly so an unexpected error is not swallowed.
            assert!(res.is_ok(), "pattern `{}` should be accepted", pat);
            assert_eq!(writer, escaped);
        }

        // may not contain more than two escape-double-quote separators
        // 3 double quotes (> 2)
        let pat = r#"one\"two\"three\"four"#;
        let mut writer = String::new();
        let res = similar_to_escape_default(pat, &mut writer);
        assert!(res.is_err());
    }

    /// Patterns translated with `#` as the escape character.
    #[test]
    fn test_escape_with_escape_text() {
        let cases = vec![
            ("", "^(?:)$"),
            ("_bcd%", "^(?:.bcd.*)$"),
            ("bcd%", "^(?:bcd.*)$"),
            (r#"_bcd\%"#, r#"^(?:.bcd\\.*)$"#),
            ("bcd[]ee", "^(?:bcd[]ee)$"),
            (r#"bcd[]ee"""#, r#"^(?:bcd[]ee"")$"#),
            (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#),
            ("bcd[pp]ee", "^(?:bcd[pp]ee)$"),
            ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"),
            ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#),
            ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#),
            (r#"%#"o_b#"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"),
        ];

        for (pat, escaped) in cases {
            let mut writer = String::new();
            let res = similar_to_escape_with_escape_text(pat, "#", &mut writer);
            // Check the result explicitly so an unexpected error is not swallowed.
            assert!(res.is_ok(), "pattern `{}` should be accepted", pat);
            assert_eq!(writer, escaped);
        }

        // An escape string longer than one character is rejected.
        let pat = "xxx";
        let mut writer = String::new();
        let res = similar_to_escape_with_escape_text(pat, "##", &mut writer);
        assert!(res.is_err());
    }

    /// Patterns translated with a non-ASCII escape character.
    #[test]
    fn test_escape_with_escape_unicode() {
        let cases = vec![
            ("", "^(?:)$"),
            ("_bcd%", "^(?:.bcd.*)$"),
            ("bcd%", "^(?:bcd.*)$"),
            (r#"_bcd\%"#, r#"^(?:.bcd\\.*)$"#),
            ("bcd[]ee", "^(?:bcd[]ee)$"),
            (r#"bcd[]ee"""#, r#"^(?:bcd[]ee"")$"#),
            (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#),
            ("bcd[pp]ee", "^(?:bcd[pp]ee)$"),
            ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"),
            ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#),
            ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#),
            (r#"%💅"o_b💅"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"),
        ];

        for (pat, escaped) in cases {
            let mut writer = String::new();
            let res = similar_to_escape_with_escape_text(pat, "💅", &mut writer);
            // Check the result explicitly so an unexpected error is not swallowed.
            assert!(res.is_ok(), "pattern `{}` should be accepted", pat);
            assert_eq!(writer, escaped);
        }

        // Two escape characters make an invalid escape string.
        let pat = "xxx";
        let mut writer = String::new();
        let res = similar_to_escape_with_escape_text(pat, "💅💅", &mut writer);
        assert!(res.is_err());
    }
}
41 changes: 41 additions & 0 deletions src/frontend/src/binder/expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,12 @@ impl Binder {
low,
high,
} => self.bind_between(*expr, negated, *low, *high),
Expr::SimilarTo {
expr,
negated,
pat,
esc_text,
} => self.bind_similar_to(*expr, negated, *pat, esc_text),
Expr::InList {
expr,
list,
Expand Down Expand Up @@ -413,6 +419,41 @@ impl Binder {
Ok(func_call.into())
}

/// Bind `<expr> [ NOT ] SIMILAR TO <pat> ESCAPE <esc_text>`
///
/// The expression is rewritten as a `RegexpEq` call whose pattern argument
/// is a `SimilarToEscape` call over `pat` (and `esc_text`, when given). A
/// negated form wraps that call in `Not`.
pub(super) fn bind_similar_to(
    &mut self,
    expr: Expr,
    negated: bool,
    pat: Expr,
    esc_text: Option<Box<Expr>>,
) -> Result<ExprImpl> {
    let expr = self.bind_expr_inner(expr)?;
    let pat = self.bind_expr_inner(pat)?;

    // `SimilarToEscape` takes the pattern, plus the escape text when present.
    let mut esc_inputs = vec![pat];
    if let Some(et) = esc_text {
        esc_inputs.push(self.bind_expr_inner(*et)?);
    }
    let esc_call =
        FunctionCall::new_unchecked(ExprType::SimilarToEscape, esc_inputs, DataType::Varchar);

    // Match the bound expression against the translated pattern.
    let matched: ExprImpl = FunctionCall::new_unchecked(
        ExprType::RegexpEq,
        vec![expr, esc_call.into()],
        DataType::Boolean,
    )
    .into();

    if !negated {
        return Ok(matched);
    }

    // `NOT SIMILAR TO` negates the match result.
    Ok(FunctionCall::new_unchecked(ExprType::Not, vec![matched], DataType::Boolean).into())
}

pub(super) fn bind_case(
&mut self,
operand: Option<Box<Expr>>,
Expand Down
Loading

0 comments on commit 8c0a1a4

Please sign in to comment.