risingwavelabs · TennyZhuang · Oct 17, 2023 · Oct 14, 2023 · Oct 16, 2023 · Oct 16, 2023
diff --git a/e2e_test/batch/basic/func.slt.part b/e2e_test/batch/basic/func.slt.part
@@ -650,3 +650,48 @@ query T
 select regexp_count('💩💩💩💩💩foo🤔️bar亲爱的😭baz这不是爱情❤️‍🔥', '亲爱的😭', 1, 'i');
 ----
 1
+
+query T
+select regexp_split_to_array('apple,banana,orange', ',');
+----
+{apple,banana,orange}
+
+query T
+select regexp_split_to_array('apple.banana!orange', '[.!]');
+----
+{apple,banana,orange}
+
+query T
+select regexp_split_to_array('applebananaorange', ',');
+----
+{applebananaorange}
+
+query T
+select regexp_split_to_array('', ',');
+----
+{""}
+
+query T
+select regexp_split_to_array('the quick brown fox jumps over the lazy dog', '\s+');
+----
+{the,quick,brown,fox,jumps,over,the,lazy,dog}
+
+query T
+select regexp_split_to_array('the quick brown fox jumps over the lazy dog', '\s*');
+----
+{t,h,e,q,u,i,c,k,b,r,o,w,n,f,o,x,j,u,m,p,s,o,v,e,r,t,h,e,l,a,z,y,d,o,g}
+
+query T
+select regexp_split_to_array('apple\\banana,orange', '\\\\|,');
+----
+{apple,banana,orange}
+
+query T
+select regexp_split_to_array('apple!!!banana?orange???grape', '[!?]+');
+----
+{apple,banana,orange,grape}
+
+query T
+select regexp_split_to_array('💩💩💩💩💩foo🤔️bar亲爱的😭baz这不是爱情❤️‍🔥', '亲爱的😭');
+----
+{💩💩💩💩💩foo🤔️bar,baz这不是爱情❤️‍🔥}
diff --git a/proto/expr.proto b/proto/expr.proto
@@ -104,6 +104,7 @@ message ExprNode {
     REGEXP_MATCH = 232;
     REGEXP_REPLACE = 280;
     REGEXP_COUNT = 281;
+    REGEXP_SPLIT_TO_ARRAY = 282;
     POW = 233;
     EXP = 234;
     CHR = 235;

diff --git a/src/expr/impl/src/scalar/regexp.rs b/src/expr/impl/src/scalar/regexp.rs
@@ -18,6 +18,7 @@ use std::str::FromStr;
 
 use fancy_regex::{Regex, RegexBuilder};
 use risingwave_common::array::ListValue;
+use risingwave_common::types::ScalarImpl;
 use risingwave_expr::{bail, function, ExprError, Result};
 
 #[derive(Debug)]
@@ -438,3 +439,85 @@ fn regexp_replace(
         Ok(ret.into())
     }
 }
+
+/// Splits `text` around every match of `regex`, following PostgreSQL's
+/// `regexp_split_to_array` semantics.
+///
+/// Every match that counts as a delimiter ends the current piece; the text
+/// between two delimiters (possibly empty) becomes one array element, and the
+/// text after the last delimiter is always emitted as the final element. A
+/// source without any delimiter therefore yields a single-element array.
+///
+/// Degenerate matches are not delimiters. These are zero-length matches at the
+/// very start or end of `text`, or immediately after the previous match, e.g.
+/// `regexp_split_to_array('123456', '')` gives `{1,2,3,4,5,6}`. A non-empty
+/// match is always a delimiter, so `regexp_split_to_array('123456', '1')`
+/// gives `{"",23456}`.
+///
+/// Always returns `Some`; the `Option` return type is kept so the signature
+/// stays unchanged.
+///
+/// # Panics
+///
+/// Panics if the regex engine reports a runtime error while matching (for
+/// example when the backtracking limit is exceeded).
+#[function(
+    // regexp_split_to_array(source, pattern)
+    "regexp_split_to_array(varchar, varchar) -> varchar[]",
+    prebuild = "RegexpContext::from_pattern($1)?"
+)]
+#[function(
+    // regexp_split_to_array(source, pattern, flags)
+    "regexp_split_to_array(varchar, varchar, varchar) -> varchar[]",
+    prebuild = "RegexpContext::from_pattern_flags($1, $2)?"
+)]
+fn regexp_split_to_array(text: &str, regex: &RegexpContext) -> Option<ListValue> {
+    let n = text.len();
+    let mut list: Vec<Option<ScalarImpl>> = Vec::new();
+    // Byte offset where the piece currently being collected starts, i.e. the end
+    // of the last match that was accepted as a delimiter.
+    let mut piece_start = 0;
+    // Byte offset where the most recent match ended, whether accepted or not.
+    let mut prev_match_end = 0;
+    // Byte offset at which the next search begins.
+    let mut search_from = 0;
+
+    while search_from <= n {
+        // Search within the whole text from an offset rather than in a sliced
+        // suffix, so that anchors, word boundaries and look-behind still see the
+        // real surrounding context.
+        let found = regex
+            .regex
+            .find_from_pos(text, search_from)
+            .expect("regex execution failed");
+        let m = match found {
+            Some(m) => m,
+            None => break,
+        };
+        let (begin, end) = (m.start(), m.end());
+
+        // Accept the match as a delimiter unless it is degenerate. A non-empty
+        // match always satisfies both conditions: it starts before the end of
+        // the text and ends past the previous match.
+        if begin < n && end > prev_match_end {
+            list.push(Some(text[piece_start..begin].into()));
+            piece_start = end;
+        }
+        prev_match_end = end;
+
+        // Resume after the match. A zero-length match would be found again at
+        // the same offset, so step over one whole character; stepping a single
+        // byte could land inside a multi-byte UTF-8 sequence.
+        search_from = end;
+        if begin == end {
+            search_from += text[end..].chars().next().map_or(1, char::len_utf8);
+        }
+    }
+
+    // Whatever follows the last delimiter is the final element. This is the
+    // whole text when nothing matched and an empty string when the text ends
+    // with a delimiter.
+    list.push(Some(text[piece_start..].into()));
+
+    Some(ListValue::new(list))
+}
diff --git a/src/frontend/src/binder/expr/function.rs b/src/frontend/src/binder/expr/function.rs
@@ -781,6 +781,7 @@ impl Binder {
                 ("regexp_match", raw_call(ExprType::RegexpMatch)),
                 ("regexp_replace", raw_call(ExprType::RegexpReplace)),
                 ("regexp_count", raw_call(ExprType::RegexpCount)),
+                ("regexp_split_to_array", raw_call(ExprType::RegexpSplitToArray)),
                 ("chr", raw_call(ExprType::Chr)),
                 ("starts_with", raw_call(ExprType::StartsWith)),
                 ("initcap", raw_call(ExprType::Initcap)),

diff --git a/src/frontend/src/expr/pure.rs b/src/frontend/src/expr/pure.rs
@@ -105,6 +105,7 @@ impl ExprVisitor<bool> for ImpureAnalyzer {
             | expr_node::Type::RegexpMatch
             | expr_node::Type::RegexpReplace
             | expr_node::Type::RegexpCount
+            | expr_node::Type::RegexpSplitToArray
             | expr_node::Type::Pow
             | expr_node::Type::Exp
             | expr_node::Type::Ln

diff --git a/src/tests/regress/data/sql/strings.sql b/src/tests/regress/data/sql/strings.sql
@@ -239,20 +239,24 @@ SELECT SUBSTRING('string' FROM -10 FOR -2147483646) AS "error";
 --@ SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s*$re$) AS foo;
 --@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', $re$\s*$re$);
 --@ SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', '') AS foo;
---@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', '');
+SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', '');
+
 --@ -- case insensitive
 --@ SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i') AS foo;
---@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i');
---@ -- no match of pattern
+SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'i');
+
+-- no match of pattern
 --@ SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', 'nomatch') AS foo;
---@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nomatch');
---@ -- some corner cases
---@ SELECT regexp_split_to_array('123456','1');
---@ SELECT regexp_split_to_array('123456','6');
---@ SELECT regexp_split_to_array('123456','.');
---@ SELECT regexp_split_to_array('123456','');
---@ SELECT regexp_split_to_array('123456','(?:)');
---@ SELECT regexp_split_to_array('1','');
+SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nomatch');
+
+-- some corner cases
+SELECT regexp_split_to_array('123456','1');
+SELECT regexp_split_to_array('123456','6');
+SELECT regexp_split_to_array('123456','.');
+SELECT regexp_split_to_array('123456','');
+SELECT regexp_split_to_array('123456','(?:)');
+SELECT regexp_split_to_array('1','');
+
 --@ -- errors
 --@ SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
 --@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'iz');