From 36baf45c565deb389a82f06116f0632913b73145 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Nov 2023 16:39:03 -0700 Subject: [PATCH] Escape quotes and newlines when converting strings to json format in to_json (#9612) * escape quotes when converting strings to json format * move withResource earlier * signoff Signed-off-by: Andy Grove * update compatibility guide * Escape newlines * address feedback * add link to issue --------- Signed-off-by: Andy Grove --- docs/compatibility.md | 2 -- .../src/main/python/json_test.py | 10 ++++++-- .../com/nvidia/spark/rapids/GpuCast.scala | 24 +++++++++++++------ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 8d18d8b57ca..ac90d309fe1 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -346,8 +346,6 @@ with Spark, and can be enabled by setting `spark.rapids.sql.expression.StructsTo Known issues are: -- String escaping is not implemented, so strings containing quotes, newlines, and other special characters will - not produce valid JSON - There is no support for timestamp types - There can be rounding differences when formatting floating-point numbers as strings. For example, Spark may produce `-4.1243574E26` but the GPU may produce `-4.124357351E26`. 
diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 043349ce54e..5b7cee85440 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -614,8 +614,14 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): pytest.param(double_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9350')), pytest.param(date_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')), pytest.param(timestamp_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')), - StringGen('[A-Za-z0-9]{0,10}', nullable=True), - pytest.param(StringGen(nullable=True), marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9514')), + StringGen('[A-Za-z0-9\r\n\'"\\\\]{0,10}', nullable=True) \ + .with_special_case('\u1f600') \ + .with_special_case('"a"') \ + .with_special_case('\\"a\\"') \ + .with_special_case('\'a\'') \ + .with_special_case('\\\'a\\\''), + pytest.param(StringGen('\u001a', nullable=True), marks=pytest.mark.xfail( + reason='https://github.com/NVIDIA/spark-rapids/issues/9705')) ], ids=idfn) @pytest.mark.parametrize('ignore_null_fields', [True, False]) @pytest.mark.parametrize('pretty', [ diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 31752d482c3..6634c946d47 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -899,7 +899,10 @@ object GpuCast { val numRows = input.getRowCount.toInt - /** Create a new column with quotes around the supplied string column */ + /** + * Create a new column with quotes around the supplied string column. Caller + * is responsible for closing `column`. 
+ */ def addQuotes(column: ColumnVector, rowCount: Int): ColumnVector = { withResource(ArrayBuffer.empty[ColumnVector]) { columns => withResource(Scalar.fromString("\"")) { quote => @@ -922,7 +925,7 @@ object GpuCast { // keys must have quotes around them in JSON mode val strKey: ColumnVector = withResource(kvStructColumn.getChildColumnView(0)) { keyColumn => withResource(castToString(keyColumn, from.keyType, options)) { key => - addQuotes(key.incRefCount(), keyColumn.getRowCount.toInt) + addQuotes(key, keyColumn.getRowCount.toInt) } } // string values must have quotes around them in JSON mode, and null values need @@ -931,7 +934,7 @@ object GpuCast { withResource(kvStructColumn.getChildColumnView(1)) { valueColumn => val valueStr = if (valueColumn.getType == DType.STRING) { withResource(castToString(valueColumn, from.valueType, options)) { valueStr => - addQuotes(valueStr.incRefCount(), valueColumn.getRowCount.toInt) + addQuotes(valueStr, valueColumn.getRowCount.toInt) } } else { castToString(valueColumn, from.valueType, options) @@ -1136,7 +1139,7 @@ object GpuCast { attrValue => if (needsQuoting) { attrValues += quote.incRefCount() - attrValues += escapeJsonString(attrValue.incRefCount()) + attrValues += escapeJsonString(attrValue) attrValues += quote.incRefCount() withResource(Scalar.fromString("")) { emptyString => ColumnVector.stringConcatenate(emptyString, emptyString, attrValues.toArray) @@ -1199,10 +1202,17 @@ object GpuCast { } } + /** + * Escape quotes and newlines in a string column. Caller is responsible for closing `cv`. 
+ */ private def escapeJsonString(cv: ColumnVector): ColumnVector = { - // this is a placeholder for implementing string escaping - // https://github.com/NVIDIA/spark-rapids/issues/9514 - cv + val chars = Seq("\r", "\n", "\\", "\"") + val escaped = chars.map(StringEscapeUtils.escapeJava) + withResource(ColumnVector.fromStrings(chars: _*)) { search => + withResource(ColumnVector.fromStrings(escaped: _*)) { replace => + cv.stringReplace(search, replace) + } + } } private[rapids] def castFloatingTypeToString(input: ColumnView): ColumnVector = {