Skip to content

Commit

Permalink
feat(expr): add regexp_split_to_array (#12844)
Browse files Browse the repository at this point in the history
Signed-off-by: TennyZhuang <[email protected]>
Co-authored-by: TennyZhuang <[email protected]>
  • Loading branch information
xzhseh and TennyZhuang authored Oct 17, 2023
1 parent 657a8ec commit 0f61b00
Show file tree
Hide file tree
Showing 6 changed files with 146 additions and 11 deletions.
45 changes: 45 additions & 0 deletions e2e_test/batch/basic/func.slt.part
Original file line number Diff line number Diff line change
Expand Up @@ -650,3 +650,48 @@ query T
select regexp_count('💩💩💩💩💩foo🤔️bar亲爱的😭baz这不是爱情❤️‍🔥', '亲爱的😭', 1, 'i');
----
1

query T
select regexp_split_to_array('apple,banana,orange', ',');
----
{apple,banana,orange}

query T
select regexp_split_to_array('apple.banana!orange', '[.!]');
----
{apple,banana,orange}

query T
select regexp_split_to_array('applebananaorange', ',');
----
{applebananaorange}

query T
select regexp_split_to_array('', ',');
----
{""}

query T
select regexp_split_to_array('the quick brown fox jumps over the lazy dog', '\s+');
----
{the,quick,brown,fox,jumps,over,the,lazy,dog}

query T
select regexp_split_to_array('the quick brown fox jumps over the lazy dog', '\s*');
----
{t,h,e,q,u,i,c,k,b,r,o,w,n,f,o,x,j,u,m,p,s,o,v,e,r,t,h,e,l,a,z,y,d,o,g}

query T
select regexp_split_to_array('apple\\banana,orange', '\\\\|,');
----
{apple,banana,orange}

query T
select regexp_split_to_array('apple!!!banana?orange???grape', '[!?]+');
----
{apple,banana,orange,grape}

query T
select regexp_split_to_array('💩💩💩💩💩foo🤔️bar亲爱的😭baz这不是爱情❤️‍🔥', '亲爱的😭');
----
{💩💩💩💩💩foo🤔️bar,baz这不是爱情❤️‍🔥}
1 change: 1 addition & 0 deletions proto/expr.proto
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ message ExprNode {
REGEXP_MATCH = 232;
REGEXP_REPLACE = 280;
REGEXP_COUNT = 281;
REGEXP_SPLIT_TO_ARRAY = 282;
POW = 233;
EXP = 234;
CHR = 235;
Expand Down
83 changes: 83 additions & 0 deletions src/expr/impl/src/scalar/regexp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use std::str::FromStr;

use fancy_regex::{Regex, RegexBuilder};
use risingwave_common::array::ListValue;
use risingwave_common::types::ScalarImpl;
use risingwave_expr::{bail, function, ExprError, Result};

#[derive(Debug)]
Expand Down Expand Up @@ -438,3 +439,85 @@ fn regexp_replace(
Ok(ret.into())
}
}

/// Splits `text` into an array of substrings, using every match of `regex`
/// as a delimiter.
///
/// Behavior, as exercised by the accompanying tests:
/// - If the pattern never matches, the result is a single-element array
///   holding the whole `text` (an empty `text` yields `{""}`).
/// - A non-empty match at the very beginning or end of the remaining text
///   produces an empty-string element, unless an empty match has already been
///   seen earlier in the scan.
/// - An empty match (e.g. from `\s*` or an empty pattern) splits off the text
///   up to and including the next character, so the scan always makes progress.
///
/// All offsets below are byte offsets into `text`.
///
/// Always returns `Some`; the `Option` is kept for the function signature.
///
/// # Panics
///
/// Panics if the regex engine returns an error while matching.
// NOTE(review): propagating that error instead of panicking would require
// changing the return type to a `Result` — confirm the macro/callers allow it.
#[function(
    // regexp_split_to_array(source, pattern)
    "regexp_split_to_array(varchar, varchar) -> varchar[]",
    prebuild = "RegexpContext::from_pattern($1)?"
)]
#[function(
    // regexp_split_to_array(source, pattern, flags)
    "regexp_split_to_array(varchar, varchar, varchar) -> varchar[]",
    prebuild = "RegexpContext::from_pattern_flags($1, $2)?"
)]
fn regexp_split_to_array(text: &str, regex: &RegexpContext) -> Option<ListValue> {
    let n = text.len();
    // Byte offset where the not-yet-emitted part of `text` begins.
    let mut start = 0;
    let mut list: Vec<Option<ScalarImpl>> = Vec::new();
    // Set once any empty match has been seen; suppresses the empty-string
    // elements that would otherwise be emitted around delimiters.
    let mut empty_flag = false;

    // `start < n` keeps `text[start..]` non-empty and in bounds.
    while start < n {
        // The match is searched in the remaining suffix, so its offsets are
        // relative to `start` and must be shifted back to absolute offsets.
        let whole_match = match regex.regex.captures(&text[start..]).unwrap() {
            Some(capture) => capture
                .get(0)
                .expect("Expected `whole_match` to be valid"),
            None => break,
        };

        let begin = whole_match.start() + start;
        let end = whole_match.end() + start;

        if begin == end {
            // Empty match (i.e., `\s*`)
            empty_flag = true;

            if begin == n {
                // We do not need to push extra stuff to the result list
                start = begin;
                break;
            }

            // Consume exactly one character after the empty match. The step
            // must be the UTF-8 width of that character: stepping by a single
            // byte would slice inside a multi-byte character and panic.
            let step = text[begin..].chars().next().map_or(1, char::len_utf8);
            list.push(Some(text[start..begin + step].into()));
            start = end + step;
            continue;
        }

        if start == begin {
            // The text before the match is empty.
            if !empty_flag {
                // Push an empty string to conform with postgres, but only if
                // no empty match has been seen before.
                list.push(Some("".into()));
            }
            start = end;
            continue;
        }

        // Normal case: `begin > start`, so there is non-empty text before the
        // delimiter.
        list.push(Some(text[start..begin].into()));
        start = end;
    }

    if start < n {
        // Push the remaining text. When there was no match at all this pushes
        // the entire text, which is the expected behavior.
        list.push(Some(text[start..].into()));
    }

    if start == n && !empty_flag {
        // The text is empty or ended with a delimiter: emit a trailing empty
        // string.
        list.push(Some("".into()));
    }

    Some(ListValue::new(list))
}
1 change: 1 addition & 0 deletions src/frontend/src/binder/expr/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,7 @@ impl Binder {
("regexp_match", raw_call(ExprType::RegexpMatch)),
("regexp_replace", raw_call(ExprType::RegexpReplace)),
("regexp_count", raw_call(ExprType::RegexpCount)),
("regexp_split_to_array", raw_call(ExprType::RegexpSplitToArray)),
("chr", raw_call(ExprType::Chr)),
("starts_with", raw_call(ExprType::StartsWith)),
("initcap", raw_call(ExprType::Initcap)),
Expand Down
1 change: 1 addition & 0 deletions src/frontend/src/expr/pure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ impl ExprVisitor<bool> for ImpureAnalyzer {
| expr_node::Type::RegexpMatch
| expr_node::Type::RegexpReplace
| expr_node::Type::RegexpCount
| expr_node::Type::RegexpSplitToArray
| expr_node::Type::Pow
| expr_node::Type::Exp
| expr_node::Type::Ln
Expand Down
26 changes: 15 additions & 11 deletions src/tests/regress/data/sql/strings.sql
Original file line number Diff line number Diff line change
Expand Up @@ -239,20 +239,24 @@ SELECT SUBSTRING('string' FROM -10 FOR -2147483646) AS "error";
--@ SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s*$re$) AS foo;
--@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', $re$\s*$re$);
--@ SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', '') AS foo;
--@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', '');
SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', '');

--@ -- case insensitive
--@ SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i') AS foo;
--@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i');
--@ -- no match of pattern
SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i');

-- no match of pattern
--@ SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', 'nomatch') AS foo;
--@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nomatch');
--@ -- some corner cases
--@ SELECT regexp_split_to_array('123456','1');
--@ SELECT regexp_split_to_array('123456','6');
--@ SELECT regexp_split_to_array('123456','.');
--@ SELECT regexp_split_to_array('123456','');
--@ SELECT regexp_split_to_array('123456','(?:)');
--@ SELECT regexp_split_to_array('1','');
SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nomatch');

-- some corner cases
SELECT regexp_split_to_array('123456','1');
SELECT regexp_split_to_array('123456','6');
SELECT regexp_split_to_array('123456','.');
SELECT regexp_split_to_array('123456','');
SELECT regexp_split_to_array('123456','(?:)');
SELECT regexp_split_to_array('1','');

--@ -- errors
--@ SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
--@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'iz');
Expand Down

0 comments on commit 0f61b00

Please sign in to comment.