Skip to content

Commit

Permalink
Add Java option to keep quotes for JSON reads (rapidsai#15146)
Browse files Browse the repository at this point in the history
Plumbs through the option to enable returning quotes with strings when reading JSON.

Authors:
   - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
   - Jason Lowe (https://github.com/jlowe)
   - Bradley Dice (https://github.com/bdice)
  • Loading branch information
revans2 authored Feb 28, 2024
1 parent 1719cda commit ab2eb58
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 12 deletions.
17 changes: 17 additions & 0 deletions java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ public final class JSONOptions extends ColumnFilterOptions {
private final boolean recoverWithNull;
private final boolean normalizeSingleQuotes;
private final boolean mixedTypesAsStrings;
private final boolean keepStringQuotes;

private JSONOptions(Builder builder) {
super(builder);
Expand All @@ -40,6 +41,7 @@ private JSONOptions(Builder builder) {
recoverWithNull = builder.recoverWithNull;
normalizeSingleQuotes = builder.normalizeSingleQuotes;
mixedTypesAsStrings = builder.mixedTypesAsStrings;
keepStringQuotes = builder.keepQuotes;
}

public boolean isDayFirst() {
Expand All @@ -63,6 +65,10 @@ public boolean isMixedTypesAsStrings() {
return mixedTypesAsStrings;
}

/**
 * Whether the reader should keep quotes of string values.
 * The value comes from {@link Builder#withKeepQuotes(boolean)} and is false unless set.
 * @return true if quotes are kept, else false.
 */
public boolean keepStringQuotes() {
return keepStringQuotes;
}

@Override
String[] getIncludeColumnNames() {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand All @@ -80,6 +86,7 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean normalizeSingleQuotes = false;

private boolean mixedTypesAsStrings = false;
private boolean keepQuotes = false;

/**
* Whether to parse dates as DD/MM versus MM/DD
Expand Down Expand Up @@ -135,6 +142,16 @@ public Builder withMixedTypesAsStrings(boolean mixedTypesAsStrings) {
return this;
}

/**
 * Set whether the reader should keep quotes of string values.
 * The default is false. The built options expose this setting through
 * {@link JSONOptions#keepStringQuotes()}.
 * @param keepQuotes true to keep them, else false.
 * @return this for chaining.
 */
public Builder withKeepQuotes(boolean keepQuotes) {
this.keepQuotes = keepQuotes;
return this;
}

@Override
public Builder includeColumn(String... names) {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand Down
22 changes: 16 additions & 6 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -252,23 +252,31 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
boolean dayFirst, boolean lines,
boolean recoverWithNulls,
boolean normalizeSingleQuotes,
boolean mixedTypesAsStrings) throws CudfException;
boolean mixedTypesAsStrings,
boolean keepStringQuotes) throws CudfException;

/**
 * Native entry point for reading JSON from a data source with a caller-supplied schema.
 * The parameter order must match the JNI function of the same name in TableJni.cpp.
 * @param numChildren flattened child counts of the schema.
 * @param columnNames flattened column names of the schema.
 * @param dTypeIds flattened type ids of the schema.
 * @param dTypeScales flattened type scales of the schema.
 * @param dayFirst value of {@link JSONOptions#isDayFirst()}.
 * @param lines value of {@link JSONOptions#isLines()}.
 * @param recoverWithNulls value of {@link JSONOptions#isRecoverWithNull()}.
 * @param normalizeSingleQuotes value of {@link JSONOptions#isNormalizeSingleQuotes()}.
 * @param mixedTypesAsStrings value of {@link JSONOptions#isMixedTypesAsStrings()}.
 * @param keepStringQuotes value of {@link JSONOptions#keepStringQuotes()}, true to keep the
 *                         quotes of string values.
 * @param dsHandle handle of the wrapped data source; the native side rejects a 0 handle.
 * @return a native handle that the caller wraps in a TableWithMeta.
 * @throws CudfException on failure in the native reader.
 */
private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
int[] dTypeIds, int[] dTypeScales,
boolean dayFirst, boolean lines,
boolean recoverWithNulls,
boolean normalizeSingleQuotes,
boolean mixedTypesAsStrings,
boolean keepStringQuotes,
long dsHandle) throws CudfException;

/**
 * Native entry point for reading JSON from a data source without a caller-supplied schema
 * (no column names or types are passed in).
 * The parameter order must match the JNI function of the same name in TableJni.cpp.
 * @param dayFirst value of {@link JSONOptions#isDayFirst()}.
 * @param lines value of {@link JSONOptions#isLines()}.
 * @param recoverWithNulls value of {@link JSONOptions#isRecoverWithNull()}.
 * @param normalizeSingleQuotes value of {@link JSONOptions#isNormalizeSingleQuotes()}.
 * @param mixedTypesAsStrings value of {@link JSONOptions#isMixedTypesAsStrings()}.
 * @param keepStringQuotes value of {@link JSONOptions#keepStringQuotes()}, true to keep the
 *                         quotes of string values.
 * @param dsHandle handle of the wrapped data source; the native side rejects a 0 handle.
 * @return a native handle to the result (presumably wrapped in a TableWithMeta by the
 *         caller; confirm against the call site).
 * @throws CudfException on failure in the native reader.
 */
private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
boolean recoverWithNulls,
boolean normalizeSingleQuotes,
boolean mixedTypesAsStrings,
boolean keepStringQuotes,
long dsHandle) throws CudfException;
private static native long readAndInferJSON(long address, long length,
boolean dayFirst, boolean lines, boolean recoverWithNulls, boolean normalizeSingleQuotes, boolean mixedTypesAsStrings) throws CudfException;
boolean dayFirst,
boolean lines,
boolean recoverWithNulls,
boolean normalizeSingleQuotes,
boolean mixedTypesAsStrings,
boolean keepStringQuotes) throws CudfException;

/**
* Read in Parquet formatted data.
Expand Down Expand Up @@ -1246,7 +1254,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
0, 0,
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings()))) {
opts.isMixedTypesAsStrings(),
opts.keepStringQuotes()))) {

return gatherJSONColumns(schema, twm);
}
Expand Down Expand Up @@ -1300,7 +1309,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings()));
opts.isMixedTypesAsStrings(), opts.keepStringQuotes()));
}

/**
Expand All @@ -1316,6 +1325,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
opts.isRecoverWithNull(),
opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings(),
opts.keepStringQuotes(),
dsHandle));
return twm;
} finally {
Expand Down Expand Up @@ -1345,7 +1355,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings()))) {
opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) {
return gatherJSONColumns(schema, twm);
}
}
Expand All @@ -1362,7 +1372,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(),
opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings(), dsHandle))) {
opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) {
return gatherJSONColumns(schema, twm);
} finally {
DataSourceHelper.destroyWrapperDataSource(dsHandle);
Expand Down
19 changes: 13 additions & 6 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1429,7 +1429,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(
JNIEnv *env, jclass, jboolean day_first, jboolean lines, jboolean recover_with_null,
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) {
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
jlong ds_handle) {

JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);

Expand All @@ -1447,6 +1448,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode)
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.keep_quotes(keep_quotes)
.mixed_types_as_string(mixed_types_as_string);

auto result =
Expand All @@ -1459,7 +1461,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string) {
jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string,
jboolean keep_quotes) {

JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
if (buffer_length <= 0) {
Expand All @@ -1481,6 +1484,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode)
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.keep_quotes(keep_quotes)
.mixed_types_as_string(mixed_types_as_string);

auto result =
Expand Down Expand Up @@ -1569,7 +1573,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null,
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) {
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
jlong ds_handle) {

JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);

Expand Down Expand Up @@ -1601,7 +1606,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode)
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.mixed_types_as_string(mixed_types_as_string);
.mixed_types_as_string(mixed_types_as_string)
.keep_quotes(keep_quotes);

if (!n_types.is_null()) {
if (n_types.size() != n_scales.size()) {
Expand Down Expand Up @@ -1640,7 +1646,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length,
jboolean day_first, jboolean lines, jboolean recover_with_null,
jboolean normalize_single_quotes, jboolean mixed_types_as_string) {
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) {

bool read_buffer = true;
if (buffer == 0) {
Expand Down Expand Up @@ -1687,7 +1693,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode)
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.mixed_types_as_string(mixed_types_as_string);
.mixed_types_as_string(mixed_types_as_string)
.keep_quotes(keep_quotes);

if (!n_types.is_null()) {
if (n_types.size() != n_scales.size()) {
Expand Down
19 changes: 19 additions & 0 deletions java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,25 @@ void testReadSingleQuotesJSONFile() throws IOException {
}
}

@Test
void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
  // A single STRING column named "A".
  Schema stringSchema = Schema.builder().column(DType.STRING, "A").build();

  // Read as JSON lines, normalizing single quotes and keeping the quotes on string values.
  JSONOptions keepQuotesOptions = JSONOptions.builder()
      .withLines(true)
      .withNormalizeSingleQuotes(true)
      .withKeepQuotes(true)
      .build();

  // Resources are declared in this order so they are closed in reverse once the check is done.
  try (Table expected = new Table.TestBuilder()
           // The quotes stay on each value, and escapes are still processed.
           .column("\"TEST\"\"", "\"TESTER'\"")
           .build();
       MultiBufferDataSource dataSource = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE);
       Table actual = Table.readJSON(stringSchema, keepQuotesOptions, dataSource)) {
    assertTablesAreEqual(expected, actual);
  }
}

private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" +
"{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" +
"{\"d\":[1,2,3]}\n" +
Expand Down

0 comments on commit ab2eb58

Please sign in to comment.