Skip to content

Commit

Permalink
Use [\u4e00-\u9fa5]+ regex rewrite kernel (#7)
Browse files Browse the repository at this point in the history
* A hacky approach for regexpr rewrite

Signed-off-by: Haoyang Li <[email protected]>

* Use contains instead for that case

Signed-off-by: Haoyang Li <[email protected]>

* add config to switch

Signed-off-by: Haoyang Li <[email protected]>

* Rewrite some rlike expression to StartsWith/EndsWith/Contains

Signed-off-by: Haoyang Li <[email protected]>

* clean up

Signed-off-by: Haoyang Li <[email protected]>

* wip

Signed-off-by: Haoyang Li <[email protected]>

* wip

Signed-off-by: Haoyang Li <[email protected]>

* add tests and config

Signed-off-by: Haoyang Li <[email protected]>

* support range filter

Signed-off-by: Haoyang Li <[email protected]>

---------

Signed-off-by: Haoyang Li <[email protected]>
  • Loading branch information
thirtiseven authored and nvliyuan committed May 6, 2024
1 parent aac469e commit 6cafecf
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 7 deletions.
14 changes: 13 additions & 1 deletion integration_tests/src/main/python/regexp_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,17 @@ def test_regexp_rlike_rewrite_optimization_str_dig():
'regexp_like(a, "[0-9]{4,}")',
'regexp_like(a, "abcd([0-9]{5})")'),
conf=_regexp_conf)

# [\\u4e00-\\u9fa5]+

@pytest.mark.skipif(is_before_spark_320(), reason='regexp_like is synonym for RLike starting in Spark 3.2.0')
def test_regexp_rlike_rewrite_optimization_chinese():
    """Check that GPU and CPU agree on regexp_like with the CJK range pattern.

    The generated strings are up to two digits, an optional run of up to three
    characters drawn from a small set of Chinese characters, and up to two
    lowercase letters, so rows both with and without a character in the
    matched range can be produced.
    """
    chinese_str_gen = mk_str_gen('[0-9]{0,2}([英伟达]{0,3})?[a-z]{0,2}')

    def select_chinese_rlike(spark):
        # Keep the input column next to the match result in the collected rows.
        source_df = unary_op_df(spark, chinese_str_gen)
        return source_df.selectExpr(
            'a',
            'regexp_like(a, "[\\u4e00-\\u9fa5]+")')

    assert_gpu_and_cpu_are_equal_collect(select_chinese_rlike, conf=_regexp_conf)

def test_regexp_replace_character_set_negated():
gen = mk_str_gen('[abcd]{0,3}[\r\n]{0,2}[abcd]{0,3}')
Expand Down Expand Up @@ -594,6 +605,7 @@ def test_character_classes():
),
conf=_regexp_conf)

@datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/10641")
def test_regexp_choice():
gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}[ \n\t\r]{0,2}')
assert_gpu_and_cpu_are_equal_collect(
Expand All @@ -617,7 +629,7 @@ def test_regexp_hexadecimal_digits():
gen = mk_str_gen(
'[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]')
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, gen, length=10).selectExpr(
lambda spark: unary_op_df(spark, gen).selectExpr(
'rlike(a, "\\\\x7f")',
'rlike(a, "\\\\x80")',
'rlike(a, "[\\\\xa0-\\\\xf0]")',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,7 @@ object RegexprPart {
case object End extends RegexprPart // $
case object Wildcard extends RegexprPart // .* or (.*)
case class Digits(from: Int, to: Int) extends RegexprPart // [0-9]{a, b}
case object Chinese extends RegexprPart // Chinese characters [\u4e00-\u9fa5]+
case class Fixstring(name: String) extends RegexprPart // normal string without special characters
case class Regexpr(value: String) extends RegexprPart // other strings
}
Expand Down Expand Up @@ -1096,6 +1097,8 @@ class GpuRLikeMeta(
Wildcard :: parseRegexToParts(s.substring(4))
case s if s.endsWith("(.*)") =>
parseRegexToParts(s.substring(0, s.length - 4)) :+ Wildcard
case s if s.startsWith("[\u4e00-\u9fa5]+") =>
  // The guard matches the Chinese-range pattern at the START of the regex, so the
  // prefix (not the suffix) must be removed and Chinese must lead the resulting
  // parts. The previous code dropped the last 6 characters and appended Chinese,
  // which only produced the right answer when the regex was exactly the pattern;
  // for any longer regex it cut the string in the wrong place.
  // Using the literal's own length avoids a hand-counted magic number.
  Chinese :: parseRegexToParts(s.substring("[\u4e00-\u9fa5]+".length))
case s if s.endsWith("([0-9]{5})") =>
parseRegexToParts(s.substring(0, s.length - 10)) :+ Digits(5, 5)
case s if s.endsWith("[0-9]{4,}") =>
Expand Down Expand Up @@ -1125,15 +1128,20 @@ class GpuRLikeMeta(
case Fixstring(s) :: List(End) => {
GpuEndsWith(lhs, GpuLiteral(s, StringType))
}
case Chinese :: rest
if rest == List() || rest.forall(_ == Wildcard) => {
// println(s"!!!GpuStringDigits chinese")
GpuStringDigits(lhs, GpuLiteral("", StringType), 1, 19968, 40869)
}
case Digits(from, _) :: rest
if rest == List() || rest.forall(_ == Wildcard) => {
// println(s"!!!GpuStringDigits1: $from")
GpuStringDigits(lhs, GpuLiteral("", StringType), from)
// println(s"!!!GpuStringDigits1")
GpuStringDigits(lhs, GpuLiteral("", StringType), from, 48, 57)
}
case Fixstring(s) :: Digits(from, _) :: rest
if rest == List() || rest.forall(_ == Wildcard) => {
// println(s"!!!GpuStringDigits2: $s, $from")
GpuStringDigits(lhs, GpuLiteral(s, StringType), from)
// println(s"!!!GpuStringDigits2")
GpuStringDigits(lhs, GpuLiteral(s, StringType), from, 48, 57)
}
case Fixstring(s) :: rest
if rest == List() || rest.forall(_ == Wildcard) => {
Expand Down Expand Up @@ -1217,15 +1225,15 @@ class GpuRLikeMeta(
}
}

case class GpuStringDigits(left: Expression, right: Expression, from: Int)
case class GpuStringDigits(left: Expression, right: Expression, from: Int, start: Int, end: Int)
extends GpuBinaryExpressionArgsAnyScalar with ImplicitCastInputTypes with NullIntolerant {

override def dataType: DataType = BooleanType

override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)

override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = {
StringDigitsPattern.stringDigitsPattern(lhs.getBase, rhs.getBase, from)
StringDigitsPattern.stringDigitsPattern(lhs.getBase, rhs.getBase, from, start, end)
}

override def doColumnar(numRows: Int, lhs: GpuScalar, rhs: GpuScalar): ColumnVector = {
Expand Down

0 comments on commit 6cafecf

Please sign in to comment.