Skip to content

Commit

Permalink
Improve JSON scan and from_json (#11702)
Browse files Browse the repository at this point in the history
* Add test

Signed-off-by: Nghia Truong <[email protected]>

* Add test `test_from_json_map_with_options`

Signed-off-by: Nghia Truong <[email protected]>

* Pass down JSON options to JNI

Signed-off-by: Nghia Truong <[email protected]>

* Add support for `allowSingleQuotes==false`

Signed-off-by: Nghia Truong <[email protected]>

* Fix tests

Signed-off-by: Nghia Truong <[email protected]>

* Update test

Signed-off-by: Nghia Truong <[email protected]>

* Update integration_tests/src/main/python/json_test.py

---------

Signed-off-by: Nghia Truong <[email protected]>
  • Loading branch information
ttnghia authored Nov 14, 2024
1 parent 57b8caa commit a8010cc
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 10 deletions.
6 changes: 2 additions & 4 deletions integration_tests/src/main/python/json_matrix_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,20 +123,18 @@ def test_json_tuple_allow_comments_off(std_input_path):
@allow_non_gpu('FileSourceScanExec')
@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
def test_scan_json_allow_single_quotes_off(std_input_path, read_func, spark_tmp_table_factory):
assert_gpu_fallback_collect(
assert_gpu_and_cpu_are_equal_collect(
read_func(std_input_path + '/' + WITH_SQ_FILE,
WITH_SQ_SCHEMA,
spark_tmp_table_factory,
{"allowSingleQuotes": "false"}),
'FileSourceScanExec',
conf=_enable_all_types_json_scan_conf)

@allow_non_gpu('ProjectExec', TEXT_INPUT_EXEC)
def test_from_json_allow_single_quotes_off(std_input_path):
schema = WITH_SQ_SCHEMA
assert_gpu_fallback_collect(
assert_gpu_and_cpu_are_equal_collect(
lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_SQ_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowSingleQuotes': "false"})),
'JsonToStructs',
conf =_enable_json_to_structs_conf)

# On is the default so it really needs to work
Expand Down
47 changes: 47 additions & 0 deletions integration_tests/src/main/python/json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,53 @@ def test_from_json_map():
.select(f.from_json(f.col('a'), 'MAP<STRING,STRING>')),
conf=_enable_all_types_conf)

@allow_non_gpu(*non_utc_allow)
def test_from_json_map_with_invalid():
    # The test here is working around some inconsistencies in how the keys are parsed for maps
    # on the GPU the keys are dense, but on the CPU they are sparse
    # Malformed/edge-case inputs mixed into otherwise well-formed JSON objects:
    # empty, whitespace-only, null, garbage, truncated objects, and trailing junk.
    malformed_patterns = [
        '',
        ' ',
        'null',
        'invalid',
        r'{"a": "[0-9]{0,5}"',
        r'{"a": "[0-9]{0,5}',
        r'{"a": "[0-9]{0,5}"}abc',
        r'{"a": "[0-9]{0,5}"}{"b": "B"}',
    ]
    json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"(, "b": "[A-Z]{0,5}")?}')
    for pattern in malformed_patterns:
        json_string_gen = json_string_gen.with_special_pattern(pattern, weight=50)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, json_string_gen)
            .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>')),
        conf=_enable_all_types_conf)

@allow_non_gpu(*non_utc_allow)
@pytest.mark.parametrize('allow_single_quotes', ['true', 'false'])
@pytest.mark.parametrize('allow_non_numeric_numbers', ['true', 'false'])
@pytest.mark.parametrize('allow_unquoted_chars', ['true', 'false'])
def test_from_json_map_with_options(allow_single_quotes,
                                    allow_non_numeric_numbers,
                                    allow_unquoted_chars):
    # Test the input with:
    # - Double quotes
    # - Single quotes
    # - Numbers with leading zeros
    # - Non-numeric numbers
    # - Unquoted control characters in quoted strings
    special_patterns = [
        r"""{'a': "[0-9]{0,5}"}""",
        r'{"a": 0[0-9]{0,5}}',
        r'{"a": [+-]?(INF|Infinity|NaN)}',
        r'{"(a|a\r\n\tb)": "(xyz|01\r\n\t23)"}',
    ]
    json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"}')
    for pattern in special_patterns:
        json_string_gen = json_string_gen.with_special_pattern(pattern, weight=50)
    options = {
        "allowSingleQuotes": allow_single_quotes,
        # Cannot test `allowNumericLeadingZeros==true` because the GPU output always has
        # leading zeros while the CPU output does not, thus test will always fail.
        "allowNumericLeadingZeros": "false",
        "allowNonNumericNumbers": allow_non_numeric_numbers,
        "allowUnquotedControlChars": allow_unquoted_chars,
    }
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, json_string_gen, length=20)
            .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>', options)),
        conf=_enable_all_types_conf)

@allow_non_gpu('ProjectExec', 'JsonToStructs')
def test_from_json_map_fallback():
# The test here is working around some inconsistencies in how the keys are parsed for maps
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,6 @@ object GpuJsonScan {
meta.willNotWorkOnGpu(s"$op does not support allowUnquotedFieldNames")
}

// {'name': 'Reynold Xin'} turning single quotes off is not supported by CUDF
if (!options.allowSingleQuotes) {
meta.willNotWorkOnGpu(s"$op does not support disabling allowSingleQuotes")
}

// {"name": "Cazen Lee", "price": "\$10"} is not supported by CUDF
if (options.allowBackslashEscapingAnyCharacter) {
meta.willNotWorkOnGpu(s"$op does not support allowBackslashEscapingAnyCharacter")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ case class GpuJsonToStructs(
override protected def doColumnar(input: GpuColumnVector): cudf.ColumnVector = {
withResource(new NvtxRange("GpuJsonToStructs", NvtxColor.YELLOW)) { _ =>
schema match {
case _: MapType => JSONUtils.extractRawMapFromJsonString(input.getBase)
case _: MapType =>
JSONUtils.extractRawMapFromJsonString(input.getBase, jsonOptionBuilder.build())
case struct: StructType =>
// if we ever need to support duplicate keys we need to keep track of the duplicates
// and make the first one null, but I don't think this will ever happen in practice
Expand Down

0 comments on commit a8010cc

Please sign in to comment.