From dc45aeedde006459be7c1382932c8e1071ae7168 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Nov 2023 11:28:14 -0600 Subject: [PATCH 1/7] escape quotes in when converting strings to json format --- integration_tests/src/main/python/json_test.py | 10 ++++++++-- .../main/scala/com/nvidia/spark/rapids/GpuCast.scala | 12 +++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 9efd6bcca4b..2c82766c324 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -575,8 +575,14 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): pytest.param(double_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9350')), pytest.param(date_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')), pytest.param(timestamp_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')), - StringGen('[A-Za-z0-9]{0,10}', nullable=True), - pytest.param(StringGen(nullable=True), marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9514')), + StringGen('[A-Za-z0-9\'"\\\\]{0,10}', nullable=True) \ + .with_special_case('\u1f600') \ + .with_special_case('"a"') \ + .with_special_case('\\"a\\"') \ + .with_special_case('\'a\'') \ + .with_special_case('\\\'a\\\''), + pytest.param(StringGen('\u001a', nullable=True), marks=pytest.mark.xfail( + reason='cuDF represents two-digit unicode characters in hex format such as \x1a')) ], ids=idfn) @pytest.mark.parametrize('ignore_null_fields', [ True, diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 5cfde1c9d15..39e6563832f 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -1169,9 +1169,15 @@ object GpuCast { } private def escapeJsonString(cv: ColumnVector): ColumnVector = { - // this is a placeholder for implementing string escaping - // https://github.com/NVIDIA/spark-rapids/issues/9514 - cv + val chars = Seq("\\", "\"") + val escaped = chars.map("\\" + _) + withResource(ColumnVector.fromStrings(chars: _*)) { search => + withResource(ColumnVector.fromStrings(escaped: _*)) { replace => + withResource(cv) { _=> + cv.stringReplace(search, replace) + } + } + } } private[rapids] def castFloatingTypeToString(input: ColumnView): ColumnVector = { From e7d502ca03ccc3e886b7d0ccad9a8f79b1324056 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Nov 2023 11:40:16 -0600 Subject: [PATCH 2/7] move withResource earlier --- .../main/scala/com/nvidia/spark/rapids/GpuCast.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 39e6563832f..c1f92dc9c2f 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -1169,11 +1169,11 @@ object GpuCast { } private def escapeJsonString(cv: ColumnVector): ColumnVector = { - val chars = Seq("\\", "\"") - val escaped = chars.map("\\" + _) - withResource(ColumnVector.fromStrings(chars: _*)) { search => - withResource(ColumnVector.fromStrings(escaped: _*)) { replace => - withResource(cv) { _=> + withResource(cv) { _=> + val chars = Seq("\\", "\"") + val escaped = chars.map("\\" + _) + withResource(ColumnVector.fromStrings(chars: _*)) { search => + withResource(ColumnVector.fromStrings(escaped: _*)) { replace => cv.stringReplace(search, replace) } } From 1682d6ab89b20071acb320c7fc61d3fa7c390bf6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Nov 2023 11:41:16 -0600 Subject: [PATCH 3/7] signoff Signed-off-by: Andy Grove From e31fd6ec87f225aeb21fc73a76b3a977debdd42c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Nov 2023 11:41:45 -0600 Subject: [PATCH 4/7] update compatibility guide --- docs/compatibility.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index b05b8d072f2..8fb95667a64 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -342,8 +342,6 @@ with Spark, and can be enabled by setting `spark.rapids.sql.expression.StructsTo Known issues are: -- String escaping is not implemented, so strings containing quotes, newlines, and other special characters will - not produce valid JSON - There is no support for timestamp types - There can be rounding differences when formatting floating-point numbers as strings. For example, Spark may produce `-4.1243574E26` but the GPU may produce `-4.124357351E26`. From 3aadf3b71aa9e323a53ebf0aa86b2dd6143a19fc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Nov 2023 12:40:15 -0600 Subject: [PATCH 5/7] Escape newlines --- integration_tests/src/main/python/json_test.py | 2 +- .../src/main/scala/com/nvidia/spark/rapids/GpuCast.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 2c82766c324..ae9c63ee811 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -575,7 +575,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): pytest.param(double_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9350')), pytest.param(date_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')), pytest.param(timestamp_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')), - StringGen('[A-Za-z0-9\'"\\\\]{0,10}', nullable=True) \ + StringGen('[A-Za-z0-9\r\n\'"\\\\]{0,10}', nullable=True) \ .with_special_case('\u1f600') \ .with_special_case('"a"') \ .with_special_case('\\"a\\"') \ diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index c1f92dc9c2f..5857eb53341 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -1170,8 +1170,8 @@ object GpuCast { private def escapeJsonString(cv: ColumnVector): ColumnVector = { withResource(cv) { _=> - val chars = Seq("\\", "\"") - val escaped = chars.map("\\" + _) + val chars = Seq("\r", "\n", "\\", "\"") + val escaped = Seq("\\r", "\\n", "\\\\", "\\\"") withResource(ColumnVector.fromStrings(chars: _*)) { search => withResource(ColumnVector.fromStrings(escaped: _*)) { replace => cv.stringReplace(search, replace) From 82ee057ccf833ee50e9f4b2b0613b83ed8663645 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Nov 2023 11:39:54 -0600 Subject: [PATCH 6/7] address feedback --- .../com/nvidia/spark/rapids/GpuCast.scala | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 5857eb53341..956a188f6f3 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -898,7 +898,10 @@ object GpuCast { val numRows = input.getRowCount.toInt - /** Create a new column with quotes around the supplied string column */ + /** + * Create a new column with quotes around the supplied string column. Caller + * is responsible for closing `column`. + */ def addQuotes(column: ColumnVector, rowCount: Int): ColumnVector = { withResource(ArrayBuffer.empty[ColumnVector]) { columns => withResource(Scalar.fromString("\"")) { quote => @@ -921,7 +924,7 @@ object GpuCast { // keys must have quotes around them in JSON mode val strKey: ColumnVector = withResource(kvStructColumn.getChildColumnView(0)) { keyColumn => withResource(castToString(keyColumn, from.keyType, options)) { key => - addQuotes(key.incRefCount(), keyColumn.getRowCount.toInt) + addQuotes(key, keyColumn.getRowCount.toInt) } } // string values must have quotes around them in JSON mode, and null values need @@ -930,7 +933,7 @@ object GpuCast { withResource(kvStructColumn.getChildColumnView(1)) { valueColumn => val valueStr = if (valueColumn.getType == DType.STRING) { withResource(castToString(valueColumn, from.valueType, options)) { valueStr => - addQuotes(valueStr.incRefCount(), valueColumn.getRowCount.toInt) + addQuotes(valueStr, valueColumn.getRowCount.toInt) } } else { castToString(valueColumn, from.valueType, options) @@ -1107,13 +1110,15 @@ object GpuCast { attrColumns += colon.incRefCount() } // write the value - val attrValue = castToString(cv, inputSchema(fieldIndex).dataType, options) if (needsQuoting) { attrColumns += quote.incRefCount() - attrColumns += escapeJsonString(attrValue) + withResource(castToString(cv, inputSchema(fieldIndex).dataType, options)) { + attrValue => + attrColumns += escapeJsonString(attrValue) + } attrColumns += quote.incRefCount() } else { - attrColumns += attrValue + attrColumns += castToString(cv, inputSchema(fieldIndex).dataType, options) } // now concatenate val jsonAttr = withResource(Scalar.fromString("")) { emptyString => @@ -1168,14 +1173,15 @@ object GpuCast { } } + /** + * Escape quotes and newlines in a string column. Caller is responsible for closing `cv`. + */ private def escapeJsonString(cv: ColumnVector): ColumnVector = { - withResource(cv) { _=> - val chars = Seq("\r", "\n", "\\", "\"") - val escaped = Seq("\\r", "\\n", "\\\\", "\\\"") - withResource(ColumnVector.fromStrings(chars: _*)) { search => - withResource(ColumnVector.fromStrings(escaped: _*)) { replace => - cv.stringReplace(search, replace) - } + val chars = Seq("\r", "\n", "\\", "\"") + val escaped = chars.map(StringEscapeUtils.escapeJava) + withResource(ColumnVector.fromStrings(chars: _*)) { search => + withResource(ColumnVector.fromStrings(escaped: _*)) { replace => + cv.stringReplace(search, replace) } } } From c55b5d344cd6be22fc6e31ce3319184586816c98 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Nov 2023 08:48:59 -0700 Subject: [PATCH 7/7] add link to issue --- integration_tests/src/main/python/json_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index b1d7ba3714c..5b7cee85440 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -621,7 +621,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): .with_special_case('\'a\'') \ .with_special_case('\\\'a\\\''), pytest.param(StringGen('\u001a', nullable=True), marks=pytest.mark.xfail( - reason='cuDF represents two-digit unicode characters in hex format such as \x1a')) + reason='https://github.com/NVIDIA/spark-rapids/issues/9705')) ], ids=idfn) @pytest.mark.parametrize('ignore_null_fields', [True, False]) @pytest.mark.parametrize('pretty', [