Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(expr): add regexp_split_to_array #12844

Merged
merged 5 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions e2e_test/batch/basic/func.slt.part
Original file line number Diff line number Diff line change
Expand Up @@ -650,3 +650,48 @@ query T
select regexp_count('💩💩💩💩💩foo🤔️bar亲爱的😭baz这不是爱情❤️‍🔥', '亲爱的😭', 1, 'i');
----
1

# Single-character literal delimiter.
query T
select regexp_split_to_array('apple,banana,orange', ',');
----
{apple,banana,orange}

# Character-class delimiter: split on either `.` or `!`.
query T
select regexp_split_to_array('apple.banana!orange', '[.!]');
----
{apple,banana,orange}

# Pattern never matches: the whole input comes back as the only element.
query T
select regexp_split_to_array('applebananaorange', ',');
----
{applebananaorange}

# Empty input: the result is a one-element array holding the empty string.
query T
select regexp_split_to_array('', ',');
----
{""}

# One-or-more whitespace as delimiter: split into words.
query T
select regexp_split_to_array('the quick brown fox jumps over the lazy dog', '\s+');
----
{the,quick,brown,fox,jumps,over,the,lazy,dog}

# Pattern that can match the empty string: split into single characters, whitespace dropped.
query T
select regexp_split_to_array('the quick brown fox jumps over the lazy dog', '\s*');
----
{t,h,e,q,u,i,c,k,b,r,o,w,n,f,o,x,j,u,m,p,s,o,v,e,r,t,h,e,l,a,z,y,d,o,g}

# Alternation with backslashes in both the input and the pattern.
query T
select regexp_split_to_array('apple\\banana,orange', '\\\\|,');
----
{apple,banana,orange}

# A run of delimiter characters is consumed as a single separator.
query T
select regexp_split_to_array('apple!!!banana?orange???grape', '[!?]+');
----
{apple,banana,orange,grape}

query T
select regexp_split_to_array('💩💩💩💩💩foo🤔️bar亲爱的😭baz这不是爱情❤️‍🔥', '亲爱的😭');
----
{💩💩💩💩💩foo🤔️bar,baz这不是爱情❤️‍🔥}
1 change: 1 addition & 0 deletions proto/expr.proto
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ message ExprNode {
REGEXP_MATCH = 232;
REGEXP_REPLACE = 280;
REGEXP_COUNT = 281;
REGEXP_SPLIT_TO_ARRAY = 282;
POW = 233;
EXP = 234;
CHR = 235;
Expand Down
83 changes: 83 additions & 0 deletions src/expr/impl/src/scalar/regexp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use std::str::FromStr;

use fancy_regex::{Regex, RegexBuilder};
use risingwave_common::array::ListValue;
use risingwave_common::types::ScalarImpl;
use risingwave_expr::{bail, function, ExprError, Result};

#[derive(Debug)]
Expand Down Expand Up @@ -438,3 +439,85 @@ fn regexp_replace(
Ok(ret.into())
}
}

/// Splits `text` around the matches of `regex` and returns the pieces as a list of strings.
///
/// Behaviour implemented here:
/// - If the pattern never matches, the whole `text` is the only element (an empty `text`
///   therefore yields a single empty string).
/// - A non-empty match that begins exactly where the previous piece ended (including at the
///   very start of `text`) contributes an empty-string piece, unless a zero-length match has
///   already been seen during this call.
/// - A zero-length match emits the text from the current position up to and including the
///   character located at the match position, then resumes right after that character.
/// - A zero-length match at the very end of `text` splits nothing; the remaining text is
///   emitted as the last piece.
///
/// Always returns `Some`.
///
/// # Panics
///
/// Panics if the regex engine reports an error while matching (the `unwrap` on the result of
/// `captures`).
#[function(
    // regexp_split_to_array(source, pattern)
    "regexp_split_to_array(varchar, varchar) -> varchar[]",
    prebuild = "RegexpContext::from_pattern($1)?"
)]
#[function(
    // regexp_split_to_array(source, pattern, flags)
    "regexp_split_to_array(varchar, varchar, varchar) -> varchar[]",
    prebuild = "RegexpContext::from_pattern_flags($1, $2)?"
)]
fn regexp_split_to_array(text: &str, regex: &RegexpContext) -> Option<ListValue> {
    let n = text.len();
    // Byte offset at which the next piece begins; also where the next search starts.
    let mut start = 0;
    let mut list: Vec<Option<ScalarImpl>> = Vec::new();
    // Set once any zero-length match has been seen; afterwards no empty pieces are emitted.
    let mut empty_flag = false;

    while start < n {
        // NOTE(review): the search runs on the sub-slice `text[start..]`, so the pattern sees
        // `start` as the beginning of its input. Anchors and look-behind may therefore behave
        // differently than on the full string - confirm whether that is intended.
        // NOTE(review): an engine-side matching error panics here because the return type
        // cannot carry an error.
        let whole_match = match regex.regex.captures(&text[start..]).unwrap() {
            Some(capture) => capture.get(0),
            None => break,
        };
        debug_assert!(whole_match.is_some(), "Expected `whole_match` to be valid");
        let whole_match = whole_match.unwrap();

        // Offsets reported by the match are relative to the sub-slice; make them absolute.
        let begin = whole_match.start() + start;
        let end = whole_match.end() + start;

        if begin == end {
            // Empty match (i.e., `\s*`)
            empty_flag = true;

            if begin == n {
                // A zero-length match at the very end does not split anything. Keep `start`
                // where it is so the tail `text[start..]` is still emitted below.
                break;
            }

            // Step over exactly one character. Using its UTF-8 width instead of a fixed `+ 1`
            // keeps both the slice end and the next search position on a char boundary.
            let step = text[begin..].chars().next().map_or(1, char::len_utf8);
            let next = begin + step;
            list.push(Some(text[start..next].into()));
            start = next;
            continue;
        }

        if start == begin {
            // Nothing lies between the previous piece and this match. Emit an empty string to
            // conform with postgres, but only if no empty match has been seen before.
            if !empty_flag {
                list.push(Some(String::new().into()));
            }
        } else {
            // Normal case: the text between the previous match and this one is a piece.
            list.push(Some(text[start..begin].into()));
        }

        // Resume right after the delimiter in either case.
        start = end;
    }

    if start < n {
        // Trailing text after the last match. When there was no match at all this is the
        // entire input, which is the expected behavior.
        list.push(Some(text[start..].into()));
    } else if !empty_flag {
        // The input was consumed exactly up to its end (or was empty): emit a trailing empty
        // piece, unless an empty match was seen.
        list.push(Some(String::new().into()));
    }

    Some(ListValue::new(list))
}
1 change: 1 addition & 0 deletions src/frontend/src/binder/expr/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,7 @@ impl Binder {
("regexp_match", raw_call(ExprType::RegexpMatch)),
("regexp_replace", raw_call(ExprType::RegexpReplace)),
("regexp_count", raw_call(ExprType::RegexpCount)),
("regexp_split_to_array", raw_call(ExprType::RegexpSplitToArray)),
("chr", raw_call(ExprType::Chr)),
("starts_with", raw_call(ExprType::StartsWith)),
("initcap", raw_call(ExprType::Initcap)),
Expand Down
1 change: 1 addition & 0 deletions src/frontend/src/expr/pure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ impl ExprVisitor<bool> for ImpureAnalyzer {
| expr_node::Type::RegexpMatch
| expr_node::Type::RegexpReplace
| expr_node::Type::RegexpCount
| expr_node::Type::RegexpSplitToArray
| expr_node::Type::Pow
| expr_node::Type::Exp
| expr_node::Type::Ln
Expand Down
26 changes: 15 additions & 11 deletions src/tests/regress/data/sql/strings.sql
Original file line number Diff line number Diff line change
Expand Up @@ -239,20 +239,24 @@ SELECT SUBSTRING('string' FROM -10 FOR -2147483646) AS "error";
--@ SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s*$re$) AS foo;
--@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', $re$\s*$re$);
--@ SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', '') AS foo;
--@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', '');
SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', '');

--@ -- case insensitive
--@ SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i') AS foo;
--@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i');
--@ -- no match of pattern
SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i');

-- no match of pattern
--@ SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', 'nomatch') AS foo;
--@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nomatch');
--@ -- some corner cases
--@ SELECT regexp_split_to_array('123456','1');
--@ SELECT regexp_split_to_array('123456','6');
--@ SELECT regexp_split_to_array('123456','.');
--@ SELECT regexp_split_to_array('123456','');
--@ SELECT regexp_split_to_array('123456','(?:)');
--@ SELECT regexp_split_to_array('1','');
SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nomatch');

-- some corner cases
SELECT regexp_split_to_array('123456','1');
SELECT regexp_split_to_array('123456','6');
SELECT regexp_split_to_array('123456','.');
SELECT regexp_split_to_array('123456','');
SELECT regexp_split_to_array('123456','(?:)');
SELECT regexp_split_to_array('1','');

--@ -- errors
--@ SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
--@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'iz');
Expand Down
Loading