From 6cafecf10fa6ae6c66909f4cdffcb54f5ee19455 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 6 May 2024 14:52:28 +0800 Subject: [PATCH] Use [\u4e00-\u9fa5]+ regex rewrite kernel (#7) * A hacky approach for regexpr rewrite Signed-off-by: Haoyang Li * Use contains instead for that case Signed-off-by: Haoyang Li * add config to switch Signed-off-by: Haoyang Li * Rewrite some rlike expression to StartsWith/EndsWith/Contains Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * add tests and config Signed-off-by: Haoyang Li * support range filter Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li --- .../src/main/python/regexp_test.py | 14 ++++++++++++- .../spark/sql/rapids/stringFunctions.scala | 20 +++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index e643b3e3a6c..a63a5a0ba6d 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -474,6 +474,17 @@ def test_regexp_rlike_rewrite_optimization_str_dig(): 'regexp_like(a, "[0-9]{4,}")', 'regexp_like(a, "abcd([0-9]{5})")'), conf=_regexp_conf) + +# [\\u4e00-\\u9fa5]+ + +@pytest.mark.skipif(is_before_spark_320(), reason='regexp_like is synonym for RLike starting in Spark 3.2.0') +def test_regexp_rlike_rewrite_optimization_chinese(): + gen = mk_str_gen('[0-9]{0,2}([英伟达]{0,3})?[a-z]{0,2}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a', + 'regexp_like(a, "[\\u4e00-\\u9fa5]+")'), + conf=_regexp_conf) def test_regexp_replace_character_set_negated(): gen = mk_str_gen('[abcd]{0,3}[\r\n]{0,2}[abcd]{0,3}') @@ -594,6 +605,7 @@ def test_character_classes(): ), conf=_regexp_conf) +@datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/10641") def test_regexp_choice(): gen = 
mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}[ \n\t\r]{0,2}') assert_gpu_and_cpu_are_equal_collect( @@ -617,7 +629,7 @@ def test_regexp_hexadecimal_digits(): gen = mk_str_gen( '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]') assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen, length=10).selectExpr( + lambda spark: unary_op_df(spark, gen).selectExpr( 'rlike(a, "\\\\x7f")', 'rlike(a, "\\\\x80")', 'rlike(a, "[\\\\xa0-\\\\xf0]")', diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index e33548868f2..bf45fc153d7 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1060,6 +1060,7 @@ object RegexprPart { case object End extends RegexprPart // $ case object Wildcard extends RegexprPart // .* or (.*) case class Digits(from: Int, to: Int) extends RegexprPart // [0-9]{a, b} + case object Chinese extends RegexprPart // Chinese characters [\u4e00-\u9fa5]+ case class Fixstring(name: String) extends RegexprPart // normal string without special characters case class Regexpr(value: String) extends RegexprPart // other strings } @@ -1096,6 +1097,8 @@ class GpuRLikeMeta( Wildcard :: parseRegexToParts(s.substring(4)) case s if s.endsWith("(.*)") => parseRegexToParts(s.substring(0, s.length - 4)) :+ Wildcard + case s if s.startsWith("[\u4e00-\u9fa5]+") => + Chinese :: parseRegexToParts(s.stripPrefix("[\u4e00-\u9fa5]+")) // parse the remainder case s if s.endsWith("([0-9]{5})") => parseRegexToParts(s.substring(0, s.length - 10)) :+ Digits(5, 5) case s if s.endsWith("[0-9]{4,}") => @@ -1125,15 +1128,20 @@ class GpuRLikeMeta( case Fixstring(s) :: List(End) => { GpuEndsWith(lhs, GpuLiteral(s, StringType)) } + case Chinese :: rest + if rest == List() || rest.forall(_ == Wildcard) => { + // println(s"!!!GpuStringDigits 
chinese") + GpuStringDigits(lhs, GpuLiteral("", StringType), 1, 19968, 40869) + } case Digits(from, _) :: rest if rest == List() || rest.forall(_ == Wildcard) => { - // println(s"!!!GpuStringDigits1: $from") - GpuStringDigits(lhs, GpuLiteral("", StringType), from) + // println(s"!!!GpuStringDigits1") + GpuStringDigits(lhs, GpuLiteral("", StringType), from, 48, 57) } case Fixstring(s) :: Digits(from, _) :: rest if rest == List() || rest.forall(_ == Wildcard) => { - // println(s"!!!GpuStringDigits2: $s, $from") - GpuStringDigits(lhs, GpuLiteral(s, StringType), from) + // println(s"!!!GpuStringDigits2") + GpuStringDigits(lhs, GpuLiteral(s, StringType), from, 48, 57) } case Fixstring(s) :: rest if rest == List() || rest.forall(_ == Wildcard) => { @@ -1217,7 +1225,7 @@ class GpuRLikeMeta( } } -case class GpuStringDigits(left: Expression, right: Expression, from: Int) +case class GpuStringDigits(left: Expression, right: Expression, from: Int, start: Int, end: Int) extends GpuBinaryExpressionArgsAnyScalar with ImplicitCastInputTypes with NullIntolerant { override def dataType: DataType = BooleanType @@ -1225,7 +1233,7 @@ case class GpuStringDigits(left: Expression, right: Expression, from: Int) override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = { - StringDigitsPattern.stringDigitsPattern(lhs.getBase, rhs.getBase, from) + StringDigitsPattern.stringDigitsPattern(lhs.getBase, rhs.getBase, from, start, end) } override def doColumnar(numRows: Int, lhs: GpuScalar, rhs: GpuScalar): ColumnVector = {