Skip to content

Commit

Permalink
Add in option for Java JSON APIs to do column pruning in CUDF (#16796)
Browse files Browse the repository at this point in the history
This adds an option to enable column pruning when reading JSON using the Java APIs.

This is still in draft because some tests fail when this option is turned on for them.

#16797

That said, the performance impact of enabling column pruning on some queries is huge. For one query in particular, the current code takes 161.5 seconds, while with CUDF column pruning it takes just 16.5 seconds. That is roughly a 10x speedup for a fairly real-world workload.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)
  - Nghia Truong (https://github.com/ttnghia)

URL: #16796
  • Loading branch information
revans2 authored Sep 24, 2024
1 parent 6badd6b commit b3518ab
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 3 deletions.
12 changes: 12 additions & 0 deletions java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public final class JSONOptions extends ColumnFilterOptions {
private final boolean allowLeadingZeros;
private final boolean allowNonNumericNumbers;
private final boolean allowUnquotedControlChars;
private final boolean cudfPruneSchema;
private final byte lineDelimiter;

private JSONOptions(Builder builder) {
Expand All @@ -53,9 +54,14 @@ private JSONOptions(Builder builder) {
allowLeadingZeros = builder.allowLeadingZeros;
allowNonNumericNumbers = builder.allowNonNumericNumbers;
allowUnquotedControlChars = builder.allowUnquotedControlChars;
cudfPruneSchema = builder.cudfPruneSchema;
lineDelimiter = builder.lineDelimiter;
}

/**
 * Reports whether CUDF column pruning was requested for JSON reads.
 *
 * @return the value configured through {@code Builder.withCudfPruneSchema};
 *         {@code false} unless the builder was told otherwise.
 */
public boolean shouldCudfPruneSchema() {
  return this.cudfPruneSchema;
}

/**
 * Returns the line delimiter configured for these options.
 *
 * @return the delimiter as a single byte, copied from the builder when
 *         these options were constructed.
 */
public byte getLineDelimiter() {
  return this.lineDelimiter;
}
Expand Down Expand Up @@ -129,8 +135,14 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean mixedTypesAsStrings = false;
private boolean keepQuotes = false;

private boolean cudfPruneSchema = false;
private byte lineDelimiter = '\n';

/**
 * Sets whether CUDF should be asked to prune columns when reading JSON.
 * The builder's default is {@code false}.
 *
 * @param prune {@code true} to request column pruning, {@code false} to leave it off.
 * @return this builder, so calls can be chained.
 */
public Builder withCudfPruneSchema(boolean prune) {
  this.cudfPruneSchema = prune;
  return this;
}

public Builder withLineDelimiter(char delimiter) {
if (delimiter > Byte.MAX_VALUE) {
throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters " + delimiter);
Expand Down
17 changes: 17 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
boolean pruneColumns,
byte lineDelimiter) throws CudfException;

private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
Expand All @@ -273,6 +274,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
boolean pruneColumns,
byte lineDelimiter,
long dsHandle) throws CudfException;

Expand Down Expand Up @@ -1312,6 +1314,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp
* @return the file parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, File path) {
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(
readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
Expand All @@ -1326,6 +1332,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter()))) {

return gatherJSONColumns(schema, twm, -1);
Expand Down Expand Up @@ -1472,6 +1479,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
assert len > 0;
assert len <= buffer.length - offset;
assert offset >= 0 && offset < buffer.length;
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSON(
schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
Expand All @@ -1487,6 +1498,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter()))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
}
Expand All @@ -1513,6 +1525,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
*/
public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) {
long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
opts.isDayFirst(),
Expand All @@ -1526,6 +1542,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter(),
dsHandle))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
Expand Down
12 changes: 9 additions & 3 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1649,7 +1649,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(false);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1703,6 +1704,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.strict_validation(strict_validation)
.mixed_types_as_string(mixed_types_as_string)
.prune_columns(false)
.delimiter(static_cast<char>(line_delimiter))
.keep_quotes(keep_quotes);
if (strict_validation) {
Expand Down Expand Up @@ -1818,6 +1820,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jboolean prune_columns,
jbyte line_delimiter,
jlong ds_handle)
{
Expand Down Expand Up @@ -1855,7 +1858,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1915,6 +1919,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jboolean prune_columns,
jbyte line_delimiter)
{
bool read_buffer = true;
Expand Down Expand Up @@ -1966,7 +1971,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down

0 comments on commit b3518ab

Please sign in to comment.