From ab2eb58be36e1140157e61aa65838670d97820b7 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 28 Feb 2024 08:49:44 -0600 Subject: [PATCH] Add java option to keep quotes for JSON reads (#15146) Plumbs through the option to enable returning quotes with strings when reading JSON. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Jason Lowe (https://github.com/jlowe) - Bradley Dice (https://github.com/bdice) --- .../main/java/ai/rapids/cudf/JSONOptions.java | 17 ++++++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 22 ++++++++++++++----- java/src/main/native/src/TableJni.cpp | 19 +++++++++++----- .../test/java/ai/rapids/cudf/TableTest.java | 19 ++++++++++++++++ 4 files changed, 65 insertions(+), 12 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 35165c18c7a..62496e32f7a 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -32,6 +32,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean recoverWithNull; private final boolean normalizeSingleQuotes; private final boolean mixedTypesAsStrings; + private final boolean keepStringQuotes; private JSONOptions(Builder builder) { super(builder); @@ -40,6 +41,7 @@ private JSONOptions(Builder builder) { recoverWithNull = builder.recoverWithNull; normalizeSingleQuotes = builder.normalizeSingleQuotes; mixedTypesAsStrings = builder.mixedTypesAsStrings; + keepStringQuotes = builder.keepQuotes; } public boolean isDayFirst() { @@ -63,6 +65,10 @@ public boolean isMixedTypesAsStrings() { return mixedTypesAsStrings; } + public boolean keepStringQuotes() { + return keepStringQuotes; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -80,6 +86,7 @@ public static final class Builder extends ColumnFilterOptions.Builder(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .keep_quotes(keep_quotes) .mixed_types_as_string(mixed_types_as_string); auto result = @@ -1459,7 +1461,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string) { + jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string, + jboolean keep_quotes) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1481,6 +1484,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .keep_quotes(keep_quotes) .mixed_types_as_string(mixed_types_as_string); auto result = @@ -1569,7 +1573,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) { + jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes, + jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1601,7 +1606,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) - .mixed_types_as_string(mixed_types_as_string); + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { @@ -1640,7 +1646,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string) { + jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) { bool read_buffer = true; if (buffer == 0) { @@ -1687,7 +1693,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) - .mixed_types_as_string(mixed_types_as_string); + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index e270c4a5183..efdb6f4bb1b 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -349,6 +349,25 @@ void testReadSingleQuotesJSONFile() throws IOException { } } + @Test + void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(true) + .withKeepQuotes(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("\"TEST\"\"", "\"TESTER'\"") // Note that escapes are also processed + .build(); + MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" +