Skip to content

Commit

Permalink
feat(expr): switch to fancy-regex crate & update the original version (#12329)
Browse files Browse the repository at this point in the history

Co-authored-by: xzhseh <[email protected]>
  • Loading branch information
xzhseh and xzhseh authored Sep 16, 2023
1 parent 0032145 commit 31fdc26
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 11 deletions.
21 changes: 21 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions e2e_test/batch/basic/func.slt.part
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,30 @@ select regexp_replace('💩💩💩💩💩foo🤔️bar亲爱的😭baz这不
----
💩💩💩💩💩foo🤔️bar亲爱的😭这是🥵爱情❤️‍🔥

# Positive Lookahead
query T
select regexp_replace('foobarbaz', 'a(?=r)', 'X');
----
foobXrbaz

# Negative Lookahead
query T
select regexp_replace('chocolate', 'o(?!c)', 'X');
----
chocXlate

# Positive Lookbehind
query T
select regexp_replace('foobarXaz', '(?<=X)a', 'X');
----
foobarXXz

# Negative Lookbehind
query T
select regexp_replace('foobarXaz', '(?<!X)a', 'X');
----
foobXrXaz

query T
select regexp_count('ABCABCAXYaxy', 'A.');
----
Expand Down
1 change: 1 addition & 0 deletions src/expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ ctor = "0.2"
downcast-rs = "1.2"
easy-ext = "1"
either = "1"
fancy-regex = "0.11.0"
futures = { version = "0.3", default-features = false, features = ["alloc"] }
futures-async-stream = { workspace = true }
futures-util = "0.3"
Expand Down
1 change: 1 addition & 0 deletions src/expr/src/table_function/regexp_matches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ fn regexp_matches<'a>(
// ignored in PostgreSQL's behavior.
let skip_flag = regex.regex.captures_len() > 1;
let list = capture
.unwrap()
.iter()
.skip(if skip_flag { 1 } else { 0 })
.map(|mat| mat.map(|m| m.as_str().into()))
Expand Down
30 changes: 19 additions & 11 deletions src/expr/src/vector_op/regexp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
use std::str::FromStr;

use regex::{Regex, RegexBuilder};
use fancy_regex::{Regex, RegexBuilder};
use risingwave_common::array::ListValue;
use risingwave_expr_macro::function;

Expand All @@ -32,10 +32,17 @@ pub struct RegexpContext {
impl RegexpContext {
fn new(pattern: &str, flags: &str, replacement: &str) -> Result<Self> {
let options = RegexpOptions::from_str(flags)?;

let origin = if options.case_insensitive {
format!("(?i:{})", pattern)
} else {
pattern.to_string()
};

Ok(Self {
regex: RegexBuilder::new(pattern)
.case_insensitive(options.case_insensitive)
.build()?,
regex: RegexBuilder::new(&origin)
.build()
.map_err(|e| ExprError::Parse(e.to_string().into()))?,
global: options.global,
replacement: make_replacement(replacement),
})
Expand Down Expand Up @@ -142,7 +149,7 @@ fn regexp_match(text: &str, regex: &RegexpContext) -> Option<ListValue> {
// If the pattern contains capture groups, capture 0 is the whole match; it is
// skipped so that only the groups are returned, matching PostgreSQL's behavior.
let skip_first = regex.regex.captures_len() > 1;
let capture = regex.regex.captures(text)?;
let capture = regex.regex.captures(text).unwrap()?;
let list = capture
.iter()
.skip(if skip_first { 1 } else { 0 })
Expand Down Expand Up @@ -190,7 +197,7 @@ fn regexp_count(text: &str, start: i32, regex: &RegexpContext) -> Result<i32> {
};

let mut count = 0;
while let Some(captures) = regex.regex.captures(&text[start..]) {
while let Ok(Some(captures)) = regex.regex.captures(&text[start..]) {
count += 1;
start += captures.get(0).unwrap().end();
}
Expand Down Expand Up @@ -297,7 +304,7 @@ fn regexp_replace(
let mut ret = text[..search_start].to_string();

// Begin the actual replace logic
while let Some(capture) = ctx.regex.captures(&text[search_start..]) {
while let Ok(Some(capture)) = ctx.regex.captures(&text[search_start..]) {
let match_start = capture.get(0).unwrap().start();
let match_end = capture.get(0).unwrap().end();

Expand Down Expand Up @@ -344,7 +351,7 @@ fn regexp_replace(
let mut count = 1;
// The absolute index for the start of searching
let mut search_start = start;
while let Some(capture) = ctx.regex.captures(&text[search_start..]) {
while let Ok(Some(capture)) = ctx.regex.captures(&text[search_start..]) {
// Get the current start & end index
let match_start = capture.get(0).unwrap().start();
let match_end = capture.get(0).unwrap().end();
Expand Down Expand Up @@ -378,7 +385,7 @@ fn regexp_replace(
if let Some(n) = n {
// Replace only the N-th match
let mut count = 1;
while let Some(capture) = ctx.regex.captures(&text[start..]) {
while let Ok(Some(capture)) = ctx.regex.captures(&text[start..]) {
if count == n {
// We've reached the pattern to replace
let match_start = capture.get(0).unwrap().start();
Expand Down Expand Up @@ -406,12 +413,13 @@ fn regexp_replace(
}
} else {
// `N` is not specified
if ctx.regex.captures(&text[start..]).is_none() {
if let Ok(None) = ctx.regex.captures(&text[start..]) {
// No match
return Ok(text.into());
}

// Otherwise replace the source text
if let Some(capture) = ctx.regex.captures(&text[start..]) {
if let Ok(Some(capture)) = ctx.regex.captures(&text[start..]) {
let match_start = capture.get(0).unwrap().start();
let match_end = capture.get(0).unwrap().end();

Expand Down
2 changes: 2 additions & 0 deletions src/workspace-hack/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ aws-credential-types = { version = "0.55", default-features = false, features =
aws-sdk-s3 = { version = "0.28", features = ["native-tls"] }
aws-smithy-client = { version = "0.55", default-features = false, features = ["native-tls", "rustls"] }
base64 = { version = "0.21", features = ["alloc"] }
bit-vec = { version = "0.6" }
bitflags = { version = "2", default-features = false, features = ["std"] }
byteorder = { version = "1", features = ["i128"] }
bytes = { version = "1", features = ["serde"] }
Expand Down Expand Up @@ -121,6 +122,7 @@ aws-credential-types = { version = "0.55", default-features = false, features =
aws-sdk-s3 = { version = "0.28", features = ["native-tls"] }
aws-smithy-client = { version = "0.55", default-features = false, features = ["native-tls", "rustls"] }
base64 = { version = "0.21", features = ["alloc"] }
bit-vec = { version = "0.6" }
bitflags = { version = "2", default-features = false, features = ["std"] }
byteorder = { version = "1", features = ["i128"] }
bytes = { version = "1", features = ["serde"] }
Expand Down

0 comments on commit 31fdc26

Please sign in to comment.