Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add in option for Java JSON APIs to do column pruning in CUDF #16796

Merged
merged 5 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public final class JSONOptions extends ColumnFilterOptions {
private final boolean allowLeadingZeros;
private final boolean allowNonNumericNumbers;
private final boolean allowUnquotedControlChars;
private final boolean cudfPruneSchema;

private JSONOptions(Builder builder) {
super(builder);
Expand All @@ -52,6 +53,11 @@ private JSONOptions(Builder builder) {
allowLeadingZeros = builder.allowLeadingZeros;
allowNonNumericNumbers = builder.allowNonNumericNumbers;
allowUnquotedControlChars = builder.allowUnquotedControlChars;
cudfPruneSchema = builder.cudfPruneSchema;
}

/**
 * Reports whether the CUDF schema-pruning flag was enabled on the builder
 * that produced these options.
 *
 * @return the value of the {@code cudfPruneSchema} flag copied from the builder.
 */
public boolean shouldCudfPruneSchema() {
  return this.cudfPruneSchema;
}

public boolean isDayFirst() {
Expand Down Expand Up @@ -123,6 +129,13 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean mixedTypesAsStrings = false;
private boolean keepQuotes = false;

private boolean cudfPruneSchema = false;

/**
 * Sets the CUDF schema-pruning flag that the built options will carry.
 * The flag is {@code false} unless this method is called.
 *
 * @param prune the value to store in the {@code cudfPruneSchema} flag.
 * @return this builder, so that calls can be chained.
 */
public Builder withCudfPruneSchema(boolean prune) {
  this.cudfPruneSchema = prune;
  return this;
}

/**
* Should json validation be strict or not
*/
Expand Down
24 changes: 21 additions & 3 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,8 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
boolean strictValidation,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl) throws CudfException;
boolean allowUnquotedControl,
boolean pruneColumns) throws CudfException;

private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
int[] dTypeIds, int[] dTypeScales,
Expand All @@ -272,6 +273,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
boolean pruneColumns,
long dsHandle) throws CudfException;

private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
Expand Down Expand Up @@ -1308,6 +1310,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp
* @return the file parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, File path) {
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
Comment on lines +1318 to +1319
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since getColumnNames calls toArray, consider storing it in a local variable. Maybe we could introduce a utils method since this PR does it a few times.

opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(
readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
Expand All @@ -1321,7 +1327,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()))) {
opts.unquotedControlChars(),
cudfPruneSchema))) {

return gatherJSONColumns(schema, twm, -1);
}
Expand Down Expand Up @@ -1465,6 +1472,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
assert len > 0;
assert len <= buffer.length - offset;
assert offset >= 0 && offset < buffer.length;
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSON(
schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
Expand All @@ -1479,7 +1490,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()))) {
opts.unquotedControlChars(),
cudfPruneSchema)
)) {
return gatherJSONColumns(schema, twm, emptyRowCount);
}
}
Expand All @@ -1505,6 +1518,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
*/
public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) {
long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
opts.isDayFirst(),
Expand All @@ -1518,6 +1535,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
dsHandle))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
} finally {
Expand Down
16 changes: 11 additions & 5 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1647,7 +1647,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.mixed_types_as_string(mixed_types_as_string)
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(false);
abellina marked this conversation as resolved.
Show resolved Hide resolved
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1700,7 +1701,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.strict_validation(strict_validation)
.mixed_types_as_string(mixed_types_as_string)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(false);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1814,6 +1816,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jboolean prune_columns,
jlong ds_handle)
{
JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
Expand Down Expand Up @@ -1849,7 +1852,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.mixed_types_as_string(mixed_types_as_string)
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1908,7 +1912,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
jboolean strict_validation,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control)
jboolean allow_unquoted_control,
jboolean prune_columns)
{
bool read_buffer = true;
if (buffer == 0) {
Expand Down Expand Up @@ -1958,7 +1963,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.mixed_types_as_string(mixed_types_as_string)
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down
Loading