Escape quotes and newlines when converting strings to json format in to_json #9612

Merged (9 commits, Nov 14, 2023). Changes from 6 commits shown.
2 changes: 0 additions & 2 deletions docs/compatibility.md
@@ -342,8 +342,6 @@ with Spark, and can be enabled by setting `spark.rapids.sql.expression.StructsTo

Known issues are:

- - String escaping is not implemented, so strings containing quotes, newlines, and other special characters will
- not produce valid JSON
- There is no support for timestamp types
- There can be rounding differences when formatting floating-point numbers as strings. For example, Spark may
produce `-4.1243574E26` but the GPU may produce `-4.124357351E26`.
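The two deleted lines above record a known issue that this PR fixes. The behavior they described can be illustrated in plain Python: naively wrapping a string in quotes produces invalid JSON as soon as the string contains quotes or newlines, while a spec-compliant encoder escapes them (an illustrative sketch, not the plugin code):

```python
import json

def naive_quote(s: str) -> str:
    # What an encoder without escaping would do: the result is not
    # a valid JSON string literal when s contains quotes or newlines.
    return '"' + s + '"'

s = 'line1\nsays "hi"'
print(naive_quote(s))  # not parseable as JSON
print(json.dumps(s))   # escapes the newline and the inner quotes
```
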
10 changes: 8 additions & 2 deletions integration_tests/src/main/python/json_test.py
@@ -575,8 +575,14 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name):
pytest.param(double_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9350')),
pytest.param(date_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')),
pytest.param(timestamp_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')),
- StringGen('[A-Za-z0-9]{0,10}', nullable=True),
- pytest.param(StringGen(nullable=True), marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9514')),
+ StringGen('[A-Za-z0-9\r\n\'"\\\\]{0,10}', nullable=True) \
+     .with_special_case('\u1f600') \
+     .with_special_case('"a"') \
+     .with_special_case('\\"a\\"') \
+     .with_special_case('\'a\'') \
+     .with_special_case('\\\'a\\\''),
+ pytest.param(StringGen('\u001a', nullable=True), marks=pytest.mark.xfail(
+     reason='cuDF represents two-digit unicode characters in hex format such as \x1a'))
Collaborator:

Is there a cudf issue to track this which we could link from here?

Maybe we can use something like this https://github.com/rapidsai/cudf/blob/f97e74f00b7a6bac37c9603def95a11b06cb013f/cpp/src/io/json/write_json.cu#L95-L108

Contributor Author:

I don't think that this is something that users will encounter - this is specific to escaping non-printable ASCII characters, and it would seem unusual to want to represent those in JSON. I wanted to note this issue to explain why we can't do a full StringGen in the tests.

Contributor Author:

@gerashegalov I added a link to a follow-up issue #9705

], ids=idfn)
@pytest.mark.parametrize('ignore_null_fields', [
True,
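The xfail in the new test exists because the JSON specification (RFC 8259) requires control characters inside strings to be escaped in `\uXXXX` form, whereas cuDF currently emits hex-style escapes such as `\x1a`. Python's standard encoder shows the spec-compliant form (a sketch for comparison, not cuDF behavior):

```python
import json

# RFC 8259 requires control characters (U+0000..U+001F) to be
# written as \uXXXX escapes inside JSON strings.
encoded = json.dumps("\x1a")
print(encoded)  # "\u001a"
```
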
30 changes: 21 additions & 9 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala
@@ -898,7 +898,10 @@ object GpuCast {

val numRows = input.getRowCount.toInt

- /** Create a new column with quotes around the supplied string column */
+ /**
+  * Create a new column with quotes around the supplied string column. Caller
+  * is responsible for closing `column`.
+  */
def addQuotes(column: ColumnVector, rowCount: Int): ColumnVector = {
withResource(ArrayBuffer.empty[ColumnVector]) { columns =>
withResource(Scalar.fromString("\"")) { quote =>
@@ -921,7 +924,7 @@
// keys must have quotes around them in JSON mode
val strKey: ColumnVector = withResource(kvStructColumn.getChildColumnView(0)) { keyColumn =>
withResource(castToString(keyColumn, from.keyType, options)) { key =>
- addQuotes(key.incRefCount(), keyColumn.getRowCount.toInt)
+ addQuotes(key, keyColumn.getRowCount.toInt)
}
}
// string values must have quotes around them in JSON mode, and null values need
@@ -930,7 +933,7 @@
withResource(kvStructColumn.getChildColumnView(1)) { valueColumn =>
val valueStr = if (valueColumn.getType == DType.STRING) {
withResource(castToString(valueColumn, from.valueType, options)) { valueStr =>
- addQuotes(valueStr.incRefCount(), valueColumn.getRowCount.toInt)
+ addQuotes(valueStr, valueColumn.getRowCount.toInt)
}
} else {
castToString(valueColumn, from.valueType, options)
@@ -1107,13 +1110,15 @@
attrColumns += colon.incRefCount()
}
// write the value
- val attrValue = castToString(cv, inputSchema(fieldIndex).dataType, options)
if (needsQuoting) {
attrColumns += quote.incRefCount()
- attrColumns += escapeJsonString(attrValue)
+ withResource(castToString(cv, inputSchema(fieldIndex).dataType, options)) {
+ attrValue =>
+ attrColumns += escapeJsonString(attrValue)
+ }
attrColumns += quote.incRefCount()
} else {
- attrColumns += attrValue
+ attrColumns += castToString(cv, inputSchema(fieldIndex).dataType, options)
}
// now concatenate
val jsonAttr = withResource(Scalar.fromString("")) { emptyString =>
@@ -1168,10 +1173,17 @@
}
}

+ /**
+  * Escape quotes and newlines in a string column. Caller is responsible for closing `cv`.
+  */
private def escapeJsonString(cv: ColumnVector): ColumnVector = {
- // this is a placeholder for implementing string escaping
- // https://github.com/NVIDIA/spark-rapids/issues/9514
- cv
+ val chars = Seq("\r", "\n", "\\", "\"")
+ val escaped = chars.map(StringEscapeUtils.escapeJava)
+ withResource(ColumnVector.fromStrings(chars: _*)) { search =>
+ withResource(ColumnVector.fromStrings(escaped: _*)) { replace =>
+ cv.stringReplace(search, replace)
+ }
+ }
}
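The new `escapeJsonString` builds parallel search/replace columns, mapping each target character to its Java-style escape, and hands them to cuDF's multi-target `stringReplace` in a single pass. A naive sequential re-implementation of the same mapping must replace the backslash first, so that the backslashes introduced by the other escapes are not themselves re-escaped (an illustrative sketch, not the cuDF kernel):

```python
def escape_json_string(s: str) -> str:
    # Backslash must come first in sequential replacement; otherwise
    # the backslashes introduced by the \r, \n, and " escapes would
    # be escaped again by a later pass.
    for target, escape in [("\\", "\\\\"), ("\r", "\\r"),
                           ("\n", "\\n"), ('"', '\\"')]:
        s = s.replace(target, escape)
    return s

print(escape_json_string('say "hi"\n'))  # prints: say \"hi\"\n
```
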

private[rapids] def castFloatingTypeToString(input: ColumnView): ColumnVector = {