Skip to content

Commit

Permalink
Merge branch 'branch-23.10' into chore/fully_remove_test_mr
Browse files Browse the repository at this point in the history
  • Loading branch information
harrism authored Sep 12, 2023
2 parents 7e46e68 + 258e0fe commit 9e65b07
Show file tree
Hide file tree
Showing 17 changed files with 219 additions and 64 deletions.
1 change: 0 additions & 1 deletion cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6534,7 +6534,6 @@ TEST_F(ParquetReaderTest, FilterFloatNAN)
auto col0 = cudf::test::fixed_width_column_wrapper<float>(elements, elements + num_rows);
auto col1 = cudf::test::fixed_width_column_wrapper<double>(elements, elements + num_rows);

cudf::test::print(col0);
auto const written_table = table_view{{col0, col1}};
auto const filepath = temp_env->get_temp_filepath("FilterFloatNAN.parquet");
{
Expand Down
4 changes: 4 additions & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,10 @@ dependencies:
cuda: "11.8"
packages:
- *nvcomp
# TODO: Fallback matrix for aarch64 CUDA 12. After migrating to nvcomp 3,
# all CUDA/arch combinations should be supported by existing packages.
- matrix:
packages:
build_wheels:
common:
- output_types: pyproject
Expand Down
25 changes: 24 additions & 1 deletion java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,11 +29,13 @@ public final class JSONOptions extends ColumnFilterOptions {

private final boolean dayFirst;
private final boolean lines;
private final boolean recoverWithNull;

private JSONOptions(Builder builder) {
super(builder);
dayFirst = builder.dayFirst;
lines = builder.lines;
recoverWithNull = builder.recoverWithNull;
}

public boolean isDayFirst() {
Expand All @@ -44,6 +46,11 @@ public boolean isLines() {
return lines;
}

/**
 * Returns whether invalid JSON lines are recovered as null values
 * rather than causing the read to fail.
 *
 * @return the current recoverWithNull setting
 */
public boolean isRecoverWithNull() {
  return this.recoverWithNull;
}

@Override
String[] getIncludeColumnNames() {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand All @@ -57,6 +64,8 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean dayFirst = false;
private boolean lines = true;

private boolean recoverWithNull = false;

/**
* Whether to parse dates as DD/MM versus MM/DD
* @param dayFirst true: DD/MM, false, MM/DD
Expand All @@ -78,6 +87,20 @@ public Builder withLines(boolean perLine) {
return this;
}

/**
 * Control how invalid lines are handled while parsing JSON. When
 * enabled, every unparseable line is recovered as a row of null
 * values; when disabled, a single invalid line causes parsing to
 * abort with an exception.
 *
 * @param recoverWithNull true to substitute nulls for invalid lines,
 *                        false to fail with an exception instead
 * @return this builder for chaining
 */
public Builder withRecoverWithNull(boolean recoverWithNull) {
  this.recoverWithNull = recoverWithNull;
  return this;
}

@Override
public Builder includeColumn(String... names) {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand Down
12 changes: 7 additions & 5 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,11 @@ private static native long[] readCSV(String[] columnNames,
private static native long readJSON(String[] columnNames,
int[] dTypeIds, int[] dTypeScales,
String filePath, long address, long length,
boolean dayFirst, boolean lines) throws CudfException;
boolean dayFirst, boolean lines,
boolean recoverWithNulls) throws CudfException;

private static native long readAndInferJSON(long address, long length,
boolean dayFirst, boolean lines) throws CudfException;
boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException;

/**
* Read in Parquet formatted data.
Expand Down Expand Up @@ -1047,7 +1048,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(),
path.getAbsolutePath(),
0, 0,
opts.isDayFirst(), opts.isLines()))) {
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()))) {

return gatherJSONColumns(schema, twm);
}
Expand Down Expand Up @@ -1099,7 +1100,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
assert len <= buffer.length - offset;
assert offset >= 0 && offset < buffer.length;
return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
opts.isDayFirst(), opts.isLines()));
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()));
}

/**
Expand All @@ -1121,7 +1122,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
assert offset >= 0 && offset < buffer.length;
try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(),
schema.getTypeIds(), schema.getTypeScales(), null,
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines()))) {
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
opts.isRecoverWithNull()))) {
return gatherJSONColumns(schema, twm);
}
}
Expand Down
18 changes: 14 additions & 4 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1331,7 +1331,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
jboolean recover_with_null) {

JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
if (buffer_length <= 0) {
Expand All @@ -1344,9 +1345,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
auto source = cudf::io::source_info{reinterpret_cast<char *>(buffer),
static_cast<std::size_t>(buffer_length)};

auto const recovery_mode = recover_with_null ?
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
cudf::io::json_recovery_mode_t::FAIL;
cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
.dayfirst(static_cast<bool>(day_first))
.lines(static_cast<bool>(lines));
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode);

auto result =
std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
Expand Down Expand Up @@ -1404,7 +1409,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
jboolean recover_with_null) {

bool read_buffer = true;
if (buffer == 0) {
Expand Down Expand Up @@ -1448,9 +1454,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
static_cast<std::size_t>(buffer_length)} :
cudf::io::source_info{filename.get()};

cudf::io::json_recovery_mode_t recovery_mode =
recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
cudf::io::json_recovery_mode_t::FAIL;
cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
.dayfirst(static_cast<bool>(day_first))
.lines(static_cast<bool>(lines));
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode);

if (!n_col_names.is_null() && data_types.size() > 0) {
if (n_col_names.size() != n_types.size()) {
Expand Down
34 changes: 34 additions & 0 deletions java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ public class TableTest extends CudfTestBase {
private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro");
private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv");
private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json");
private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json");

private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder()
.column(DType.INT32, "A")
Expand Down Expand Up @@ -326,6 +327,39 @@ void testReadJSONFile() {
}
}

@Test
void testReadJSONFileWithInvalidLines() {
  // Fixture contains one line that is not valid JSON; exercise both
  // recovery modes of the JSON reader against it.
  Schema schema = Schema.builder()
      .column(DType.STRING, "name")
      .column(DType.INT32, "age")
      .build();

  // recoverWithNull=true: the invalid line becomes a row of nulls
  JSONOptions recoverOpts = JSONOptions.builder()
      .withLines(true)
      .withRecoverWithNull(true)
      .build();
  try (Table expected = new Table.TestBuilder()
          .column("Michael", "Andy", null, "Justin")
          .column(null, 30, null, 19)
          .build();
       Table actual = Table.readJSON(schema, recoverOpts, TEST_JSON_ERROR_FILE)) {
    assertTablesAreEqual(expected, actual);
  }

  // recoverWithNull=false: the invalid line aborts the whole read
  JSONOptions strictOpts = JSONOptions.builder()
      .withLines(true)
      .withRecoverWithNull(false)
      .build();
  assertThrows(CudfException.class,
      () -> Table.readJSON(schema, strictOpts, TEST_JSON_ERROR_FILE));
}

@Test
void testReadJSONFileWithDifferentColumnOrder() {
Schema schema = Schema.builder()
Expand Down
4 changes: 4 additions & 0 deletions java/src/test/resources/people_with_invalid_lines.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"name":"Michael"}
{"name":"Andy", "age":30}
this_line_is_not_valid
{"name":"Justin", "age":19}
6 changes: 6 additions & 0 deletions python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,15 @@ find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
execute_process(
COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())"
OUTPUT_VARIABLE PYARROW_INCLUDE_DIR
ERROR_VARIABLE PYARROW_ERROR
RESULT_VARIABLE PYARROW_RESULT
OUTPUT_STRIP_TRAILING_WHITESPACE
)

if(${PYARROW_RESULT})
message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}")
endif()

set(targets_using_arrow_headers interop avro csv orc json parquet)
foreach(target IN LISTS targets_using_arrow_headers)
target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
Expand Down
23 changes: 18 additions & 5 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,19 +608,32 @@ def intersection(self, other, sort=False):
(1, 'Blue')],
)
"""
if not can_convert_to_column(other):
raise TypeError("Input must be Index or array-like")

if not isinstance(other, BaseIndex):
other = cudf.Index(other, name=self.name)
other = cudf.Index(
other,
name=getattr(other, "name", self.name),
)

if sort not in {None, False}:
raise ValueError(
f"The 'sort' keyword only takes the values of "
f"None or False; {sort} was passed."
)

if self.equals(other):
if self.has_duplicates:
return self.unique()._get_reconciled_name_object(other)
return self._get_reconciled_name_object(other)
if not len(self) or not len(other) or self.equals(other):
common_dtype = cudf.utils.dtypes._dtype_pandas_compatible(
cudf.utils.dtypes.find_common_type([self.dtype, other.dtype])
)

lhs = self.unique() if self.has_duplicates else self
rhs = other
if not len(other):
lhs, rhs = rhs, lhs

return lhs._get_reconciled_name_object(rhs).astype(common_dtype)

res_name = _get_result_name(self.name, other.name)

Expand Down
16 changes: 10 additions & 6 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from cudf._lib.filling import sequence
from cudf._lib.search import search_sorted
from cudf._lib.types import size_type_dtype
from cudf.api.extensions import no_default
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
is_categorical_dtype,
Expand Down Expand Up @@ -95,7 +96,7 @@ def _lexsorted_equal_range(
return lower_bound, upper_bound, sort_inds


def _index_from_data(data: MutableMapping, name: Any = None):
def _index_from_data(data: MutableMapping, name: Any = no_default):
"""Construct an index of the appropriate type from some data."""

if len(data) == 0:
Expand Down Expand Up @@ -131,7 +132,7 @@ def _index_from_data(data: MutableMapping, name: Any = None):


def _index_from_columns(
columns: List[cudf.core.column.ColumnBase], name: Any = None
columns: List[cudf.core.column.ColumnBase], name: Any = no_default
):
"""Construct an index from ``columns``, with levels named 0, 1, 2..."""
return _index_from_data(dict(zip(range(len(columns)), columns)), name=name)
Expand Down Expand Up @@ -681,7 +682,9 @@ def _union(self, other, sort=None):
@_cudf_nvtx_annotate
def _intersection(self, other, sort=False):
if not isinstance(other, RangeIndex):
return super()._intersection(other, sort=sort)
return self._try_reconstruct_range_index(
super()._intersection(other, sort=sort)
)

if not len(self) or not len(other):
return RangeIndex(0)
Expand Down Expand Up @@ -722,7 +725,7 @@ def _intersection(self, other, sort=False):
if sort is None:
new_index = new_index.sort_values()

return new_index
return self._try_reconstruct_range_index(new_index)

@_cudf_nvtx_annotate
def difference(self, other, sort=None):
Expand Down Expand Up @@ -1032,10 +1035,10 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
@classmethod
@_cudf_nvtx_annotate
def _from_data(
cls, data: MutableMapping, name: Any = None
cls, data: MutableMapping, name: Any = no_default
) -> GenericIndex:
out = super()._from_data(data=data)
if name is not None:
if name is not no_default:
out.name = name
return out

Expand Down Expand Up @@ -3334,6 +3337,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex:
- DatetimeIndex for Datetime input.
- GenericIndex for all other inputs.
"""

kwargs = _setdefault_name(arbitrary, **kwargs)
if isinstance(arbitrary, cudf.MultiIndex):
return arbitrary
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/core/join/_join_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def _match_join_keys(
common_type = ltype.categories.dtype
else:
common_type = rtype.categories.dtype
common_type = cudf.utils.dtypes._dtype_pandas_compatible(common_type)
return lcol.astype(common_type), rcol.astype(common_type)

if is_dtype_equal(ltype, rtype):
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,10 +605,10 @@ def _from_data(
cls,
data: MutableMapping,
index: Optional[BaseIndex] = None,
name: Any = None,
name: Any = no_default,
) -> Series:
out = super()._from_data(data=data, index=index)
if name is not None:
if name is not no_default:
out.name = name
return out

Expand Down
Loading

0 comments on commit 9e65b07

Please sign in to comment.