Skip to content

Commit

Permalink
Improve JSON scan and from_json (#11702)
Browse files Browse the repository at this point in the history
* Add test

Signed-off-by: Nghia Truong <[email protected]>

* Add test `test_from_json_map_with_options`

Signed-off-by: Nghia Truong <[email protected]>

* Pass down JSON options to JNI

Signed-off-by: Nghia Truong <[email protected]>

* Add support for `allowSingleQuotes==false`

Signed-off-by: Nghia Truong <[email protected]>

* Fix tests

Signed-off-by: Nghia Truong <[email protected]>

* Update test

Signed-off-by: Nghia Truong <[email protected]>

* Update integration_tests/src/main/python/json_test.py

---------

Signed-off-by: Nghia Truong <[email protected]>
  • Loading branch information
ttnghia authored Nov 14, 2024
1 parent 57b8caa commit a8010cc
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 10 deletions.
6 changes: 2 additions & 4 deletions integration_tests/src/main/python/json_matrix_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,20 +123,18 @@ def test_json_tuple_allow_comments_off(std_input_path):
@allow_non_gpu('FileSourceScanExec')
@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
def test_scan_json_allow_single_quotes_off(std_input_path, read_func, spark_tmp_table_factory):
assert_gpu_fallback_collect(
assert_gpu_and_cpu_are_equal_collect(
read_func(std_input_path + '/' + WITH_SQ_FILE,
WITH_SQ_SCHEMA,
spark_tmp_table_factory,
{"allowSingleQuotes": "false"}),
'FileSourceScanExec',
conf=_enable_all_types_json_scan_conf)

@allow_non_gpu('ProjectExec', TEXT_INPUT_EXEC)
def test_from_json_allow_single_quotes_off(std_input_path):
schema = WITH_SQ_SCHEMA
assert_gpu_fallback_collect(
assert_gpu_and_cpu_are_equal_collect(
lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_SQ_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowSingleQuotes': "false"})),
'JsonToStructs',
conf =_enable_json_to_structs_conf)

# On is the default so it really needs to work
Expand Down
47 changes: 47 additions & 0 deletions integration_tests/src/main/python/json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,53 @@ def test_from_json_map():
.select(f.from_json(f.col('a'), 'MAP<STRING,STRING>')),
conf=_enable_all_types_conf)

@allow_non_gpu(*non_utc_allow)
def test_from_json_map_with_invalid():
    # The test here is working around some inconsistencies in how the keys are parsed for maps
    # on the GPU the keys are dense, but on the CPU they are sparse
    # Malformed/edge-case inputs mixed into otherwise well-formed JSON objects:
    # empty, whitespace-only, null, garbage, truncated objects, and trailing junk.
    malformed_patterns = [
        '',
        ' ',
        'null',
        'invalid',
        r'{"a": "[0-9]{0,5}"',
        r'{"a": "[0-9]{0,5}',
        r'{"a": "[0-9]{0,5}"}abc',
        r'{"a": "[0-9]{0,5}"}{"b": "B"}',
    ]
    json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"(, "b": "[A-Z]{0,5}")?}')
    for pattern in malformed_patterns:
        json_string_gen = json_string_gen.with_special_pattern(pattern, weight=50)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, json_string_gen)
            .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>')),
        conf=_enable_all_types_conf)

@allow_non_gpu(*non_utc_allow)
@pytest.mark.parametrize('allow_single_quotes', ['true', 'false'])
@pytest.mark.parametrize('allow_non_numeric_numbers', ['true', 'false'])
@pytest.mark.parametrize('allow_unquoted_chars', ['true', 'false'])
def test_from_json_map_with_options(allow_single_quotes,
                                    allow_non_numeric_numbers,
                                    allow_unquoted_chars):
    # Test the input with:
    # - Double quotes
    # - Single quotes
    # - Numbers with leading zeros
    # - Non-numeric numbers
    # - Unquoted control characters in quoted strings
    special_patterns = [
        r"""{'a': "[0-9]{0,5}"}""",
        r'{"a": 0[0-9]{0,5}}',
        r'{"a": [+-]?(INF|Infinity|NaN)}',
        r'{"(a|a\r\n\tb)": "(xyz|01\r\n\t23)"}',
    ]
    json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"}')
    for pattern in special_patterns:
        json_string_gen = json_string_gen.with_special_pattern(pattern, weight=50)
    options = {
        "allowSingleQuotes": allow_single_quotes,
        # Cannot test `allowNumericLeadingZeros==true` because the GPU output always has
        # leading zeros while the CPU output does not, thus test will always fail.
        "allowNumericLeadingZeros": "false",
        "allowNonNumericNumbers": allow_non_numeric_numbers,
        "allowUnquotedControlChars": allow_unquoted_chars,
    }
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, json_string_gen, length=20)
            .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>', options)),
        conf=_enable_all_types_conf)

@allow_non_gpu('ProjectExec', 'JsonToStructs')
def test_from_json_map_fallback():
# The test here is working around some inconsistencies in how the keys are parsed for maps
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,6 @@ object GpuJsonScan {
meta.willNotWorkOnGpu(s"$op does not support allowUnquotedFieldNames")
}

// {'name': 'Reynold Xin'} turning single quotes off is not supported by CUDF
if (!options.allowSingleQuotes) {
meta.willNotWorkOnGpu(s"$op does not support disabling allowSingleQuotes")
}

// {"name": "Cazen Lee", "price": "\$10"} is not supported by CUDF
if (options.allowBackslashEscapingAnyCharacter) {
meta.willNotWorkOnGpu(s"$op does not support allowBackslashEscapingAnyCharacter")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ case class GpuJsonToStructs(
override protected def doColumnar(input: GpuColumnVector): cudf.ColumnVector = {
withResource(new NvtxRange("GpuJsonToStructs", NvtxColor.YELLOW)) { _ =>
schema match {
case _: MapType => JSONUtils.extractRawMapFromJsonString(input.getBase)
case _: MapType =>
JSONUtils.extractRawMapFromJsonString(input.getBase, jsonOptionBuilder.build())
case struct: StructType =>
// if we ever need to support duplicate keys we need to keep track of the duplicates
// and make the first one null, but I don't think this will ever happen in practice
Expand Down

0 comments on commit a8010cc

Please sign in to comment.