Skip to content

Commit

Permalink
Merge branch 'branch-24.10' into pylibcudf/strings/findall
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt711 authored Sep 24, 2024
2 parents e76c796 + 73fa557 commit 5b9c66e
Show file tree
Hide file tree
Showing 10 changed files with 137 additions and 24 deletions.
4 changes: 2 additions & 2 deletions ci/cudf_pandas_scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ else

echo "" > ./constraints.txt
if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
# `test_python` constraints are for `[test]` not `[cudf-pandas-tests]`
# `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]`
rapids-dependency-file-generator \
--output requirements \
--file-key test_python \
--file-key test_python_cudf_pandas \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
| tee ./constraints.txt
fi
Expand Down
6 changes: 3 additions & 3 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}')
NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}

# Need to distutils-normalize the versions for some use cases
CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))")
NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))")
CURRENT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${CURRENT_SHORT_TAG}'))")
NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))")

echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"

Expand Down
4 changes: 2 additions & 2 deletions ci/test_python_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ set -euo pipefail
rapids-logger "Generate Python testing dependencies"

ENV_YAML_DIR="$(mktemp -d)"

FILE_KEY=$1
rapids-dependency-file-generator \
--output conda \
--file-key test_python \
--file-key ${FILE_KEY} \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
| tee "${ENV_YAML_DIR}/env.yaml"

Expand Down
2 changes: 1 addition & 1 deletion ci/test_python_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../;

# Common setup steps shared by Python test jobs
source ./ci/test_python_common.sh
source ./ci/test_python_common.sh test_python_cudf

rapids-logger "Check GPU usage"
nvidia-smi
Expand Down
2 changes: 1 addition & 1 deletion ci/test_python_other.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../

# Common setup steps shared by Python test jobs
source ./ci/test_python_common.sh
source ./ci/test_python_common.sh test_python_other

rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
Expand Down
36 changes: 32 additions & 4 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,28 @@ files:
includes:
- cuda_version
- test_cpp
test_python:
test_python_cudf_pandas:
output: none
includes:
- cuda_version
- py_version
- test_python_common
- test_python_cudf
- test_python_dask_cudf
- test_python_cudf_pandas
test_python_cudf:
output: none
includes:
- cuda_version
- py_version
- test_python_common
- test_python_cudf
test_python_other:
output: none
includes:
- cuda_version
- py_version
- test_python_common
- test_python_dask_cudf
test_java:
output: none
includes:
Expand Down Expand Up @@ -707,9 +720,7 @@ dependencies:
- matrix: {dependencies: "oldest"}
packages:
- numba==0.57.*
- numpy==1.23.*
- pandas==2.0.*
- pyarrow==14.0.0
- matrix:
packages:
- output_types: conda
Expand Down Expand Up @@ -764,6 +775,14 @@ dependencies:
- &transformers transformers==4.39.3
- tzdata
specific:
- output_types: [conda, requirements]
matrices:
- matrix: {dependencies: "oldest"}
packages:
- numpy==1.23.*
- pyarrow==14.0.0
- matrix:
packages:
- output_types: conda
matrices:
- matrix:
Expand All @@ -783,6 +802,15 @@ dependencies:
packages:
- dask-cuda==24.10.*,>=0.0.0a0
- *numba
specific:
- output_types: [conda, requirements]
matrices:
- matrix: {dependencies: "oldest"}
packages:
- numpy==1.24.*
- pyarrow==14.0.1
- matrix:
packages:
depends_on_libcudf:
common:
- output_types: conda
Expand Down
28 changes: 28 additions & 0 deletions java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public final class JSONOptions extends ColumnFilterOptions {
private final boolean allowLeadingZeros;
private final boolean allowNonNumericNumbers;
private final boolean allowUnquotedControlChars;
private final boolean cudfPruneSchema;
private final byte lineDelimiter;

private JSONOptions(Builder builder) {
super(builder);
Expand All @@ -52,6 +54,16 @@ private JSONOptions(Builder builder) {
allowLeadingZeros = builder.allowLeadingZeros;
allowNonNumericNumbers = builder.allowNonNumericNumbers;
allowUnquotedControlChars = builder.allowUnquotedControlChars;
cudfPruneSchema = builder.cudfPruneSchema;
lineDelimiter = builder.lineDelimiter;
}

public boolean shouldCudfPruneSchema() {
return cudfPruneSchema;
}

public byte getLineDelimiter() {
return lineDelimiter;
}

public boolean isDayFirst() {
Expand Down Expand Up @@ -123,6 +135,22 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean mixedTypesAsStrings = false;
private boolean keepQuotes = false;

private boolean cudfPruneSchema = false;
private byte lineDelimiter = '\n';

public Builder withCudfPruneSchema(boolean prune) {
cudfPruneSchema = prune;
return this;
}

public Builder withLineDelimiter(char delimiter) {
if (delimiter > Byte.MAX_VALUE) {
throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters " + delimiter);
}
lineDelimiter = (byte)delimiter;
return this;
}

/**
* Should json validation be strict or not
*/
Expand Down
36 changes: 31 additions & 5 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,9 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
boolean strictValidation,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl) throws CudfException;
boolean allowUnquotedControl,
boolean pruneColumns,
byte lineDelimiter) throws CudfException;

private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
int[] dTypeIds, int[] dTypeScales,
Expand All @@ -272,6 +274,8 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
boolean pruneColumns,
byte lineDelimiter,
long dsHandle) throws CudfException;

private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
Expand All @@ -284,6 +288,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
byte lineDelimiter,
long dsHandle) throws CudfException;

private static native long readAndInferJSON(long address, long length,
Expand All @@ -297,7 +302,8 @@ private static native long readAndInferJSON(long address, long length,
boolean strictValidation,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl) throws CudfException;
boolean allowUnquotedControl,
byte lineDelimiter) throws CudfException;

/**
* Read in Parquet formatted data.
Expand Down Expand Up @@ -1308,6 +1314,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp
* @return the file parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, File path) {
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(
readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
Expand All @@ -1321,7 +1331,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()))) {
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter()))) {

return gatherJSONColumns(schema, twm, -1);
}
Expand Down Expand Up @@ -1404,7 +1416,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()));
opts.unquotedControlChars(),
opts.getLineDelimiter()));
}

/**
Expand All @@ -1426,6 +1439,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
opts.getLineDelimiter(),
dsHandle));
return twm;
} finally {
Expand Down Expand Up @@ -1465,6 +1479,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
assert len > 0;
assert len <= buffer.length - offset;
assert offset >= 0 && offset < buffer.length;
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSON(
schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
Expand All @@ -1479,7 +1497,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()))) {
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter()))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
}
}
Expand All @@ -1505,6 +1525,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
*/
public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) {
long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
opts.isDayFirst(),
Expand All @@ -1518,6 +1542,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter(),
dsHandle))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
} finally {
Expand Down
24 changes: 19 additions & 5 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1627,6 +1627,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jbyte line_delimiter,
jlong ds_handle)
{
JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
Expand All @@ -1646,8 +1647,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(false);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1676,7 +1679,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
jboolean strict_validation,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control)
jboolean allow_unquoted_control,
jbyte line_delimiter)
{
JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
if (buffer_length <= 0) {
Expand All @@ -1700,6 +1704,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.strict_validation(strict_validation)
.mixed_types_as_string(mixed_types_as_string)
.prune_columns(false)
.delimiter(static_cast<char>(line_delimiter))
.keep_quotes(keep_quotes);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
Expand Down Expand Up @@ -1814,6 +1820,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jboolean prune_columns,
jbyte line_delimiter,
jlong ds_handle)
{
JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
Expand Down Expand Up @@ -1848,8 +1856,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1908,7 +1918,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
jboolean strict_validation,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control)
jboolean allow_unquoted_control,
jboolean prune_columns,
jbyte line_delimiter)
{
bool read_buffer = true;
if (buffer == 0) {
Expand Down Expand Up @@ -1957,8 +1969,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down
Loading

0 comments on commit 5b9c66e

Please sign in to comment.