Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into ref/index2
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar authored Apr 10, 2024
2 parents d676074 + 460b41e commit bba4364
Show file tree
Hide file tree
Showing 17 changed files with 402 additions and 91 deletions.
13 changes: 9 additions & 4 deletions .github/workflows/status.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,18 @@ jobs:
state: CUSTOM_STATE = 'success'
} = contentJSON;
// Fetch the first job ID from the workflow run
const jobs = await github.rest.actions.listJobsForWorkflowRun({
// Fetch all jobs using pagination
const jobs = await github.paginate(
github.rest.actions.listJobsForWorkflowRun,
{
owner: context.repo.owner,
repo: context.repo.repo,
run_id: process.env.WORKFLOW_RUN_ID,
});
const job = jobs.data.jobs.find(job => job.name === JOB_NAME);
}
);
// Find the job named JOB_NAME in the workflow run and take its ID
const job = jobs.find(job => job.name === JOB_NAME);
const JOB_ID = job ? job.id : null;
// Set default target URL if not defined
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(min_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
Expand Down
297 changes: 297 additions & 0 deletions CHANGELOG.md

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions ci/cudf_pandas_scripts/pandas-tests/diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@
GH_JOB_NAME="pandas-tests-diff / build"
rapids-logger "Github job name: ${GH_JOB_NAME}"

MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py310.main-results.json
PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py39.pr-results.json
PY_VER="39"
MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-results.json
PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-results.json

rapids-logger "Fetching latest available results from nightly"
aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
cat s3_output.txt
aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt

read -r COMPARE_ENV < s3_output.txt
export COMPARE_ENV
rapids-logger "Latest available results from nightly: ${COMPARE_ENV}"
Expand Down
4 changes: 3 additions & 1 deletion cpp/examples/build.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#!/bin/bash

# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

# libcudf examples build script

set -euo pipefail

# Parallelism control
PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}

Expand Down
4 changes: 3 additions & 1 deletion cpp/examples/strings/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/column/column_view.hpp>
#include <cudf/io/csv.hpp>
#include <cudf/io/datasource.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

Expand Down Expand Up @@ -110,7 +111,8 @@ int main(int argc, char const** argv)

std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - st;
std::cout << "Wall time: " << elapsed.count() << " seconds\n";
std::cout << "Output size " << result->view().child(1).size() << " bytes\n";
auto const scv = cudf::strings_column_view(result->view());
std::cout << "Output size " << scv.chars_size(rmm::cuda_stream_default) << " bytes\n";

return 0;
}
8 changes: 6 additions & 2 deletions cpp/examples/strings/custom_optimized.cu
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,12 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
redact_kernel<<<blocks, block_size, 0, stream.value()>>>(
*d_names, *d_visibilities, offsets.data(), chars.data());

// create column from offsets and chars vectors (no copy is performed)
auto result = cudf::make_strings_column(names.size(), std::move(offsets), chars.release(), {}, 0);
// create column from offsets vector (move only)
auto offsets_column = std::make_unique<cudf::column>(std::move(offsets), rmm::device_buffer{}, 0);

// create column for chars vector (no copy is performed)
auto result = cudf::make_strings_column(
names.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});

// wait for all of the above to finish
stream.synchronize();
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def set_rand_params(self, params):
if dtype_val is not None:
dtype_val = {
col_name: "category"
if cudf.utils.dtypes._is_categorical_dtype(dtype)
if isinstance(dtype, cudf.CategoricalDtype)
else pandas_dtypes_to_np_dtypes[dtype]
for col_name, dtype in dtype_val.items()
}
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _get_dtype_param_value(dtype_val):
if dtype_val is not None and isinstance(dtype_val, abc.Mapping):
processed_dtypes = {}
for col_name, dtype in dtype_val.items():
if cudf.utils.dtypes._is_categorical_dtype(dtype):
if isinstance(dtype, cudf.CategoricalDtype):
processed_dtypes[col_name] = "category"
else:
processed_dtypes[col_name] = str(
Expand Down
15 changes: 7 additions & 8 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -434,19 +434,19 @@ def read_csv(
if dtype is not None:
if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
if cudf.api.types._is_categorical_dtype(v):
if isinstance(cudf.dtype(v), cudf.CategoricalDtype):
df._data[str(k)] = df._data[str(k)].astype(v)
elif (
cudf.api.types.is_scalar(dtype) or
isinstance(dtype, (
np.dtype, pd.api.extensions.ExtensionDtype, type
))
):
if cudf.api.types._is_categorical_dtype(dtype):
if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype):
df = df.astype(dtype)
elif isinstance(dtype, abc.Collection):
for index, col_dtype in enumerate(dtype):
if cudf.api.types._is_categorical_dtype(col_dtype):
if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
col_name = df._data.names[index]
df._data[col_name] = df._data[col_name].astype(col_dtype)

Expand Down Expand Up @@ -554,11 +554,10 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
# TODO: Remove this work-around once Dictionary types
# in libcudf are fully mapped to categorical columns:
# https://github.com/rapidsai/cudf/issues/3960
if cudf.api.types._is_categorical_dtype(dtype):
if isinstance(dtype, str):
dtype = "str"
else:
dtype = dtype.categories.dtype
if isinstance(dtype, cudf.CategoricalDtype):
dtype = dtype.categories.dtype
elif dtype == "category":
dtype = "str"

if isinstance(dtype, str):
if str(dtype) == "date32":
Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
from cudf._lib.types import size_type_dtype
from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.api.types import (
_is_categorical_dtype,
_is_non_decimal_numeric_dtype,
_is_pandas_nullable_extension_dtype,
infer_dtype,
Expand Down Expand Up @@ -1381,7 +1380,7 @@ def column_empty_like(

if (
hasattr(column, "dtype")
and _is_categorical_dtype(column.dtype)
and isinstance(column.dtype, cudf.CategoricalDtype)
and dtype == column.dtype
):
catcolumn = cast("cudf.core.column.CategoricalColumn", column)
Expand Down Expand Up @@ -2008,7 +2007,9 @@ def as_column(
length = 1
elif length < 0:
raise ValueError(f"{length=} must be >=0.")
if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype):
if isinstance(
arbitrary, pd.Interval
) or cudf.api.types._is_categorical_dtype(dtype):
# No cudf.Scalar support yet
return as_column(
pd.Series([arbitrary] * length),
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ def dtype(arbitrary):
raise TypeError(f"Unsupported type {np_dtype}")
return np_dtype

if isinstance(arbitrary, str) and arbitrary in {"hex", "hex32", "hex64"}:
# read_csv only accepts "hex"
# e.g. test_csv_reader_hexadecimals, test_csv_reader_hexadecimal_overflow
return arbitrary

# use `pandas_dtype` to try and interpret
# `arbitrary` as a Pandas extension type.
# Return the corresponding NumPy/cuDF type.
Expand Down Expand Up @@ -999,7 +1004,10 @@ def _is_categorical_dtype(obj):
pd.Series,
),
):
return _is_categorical_dtype(obj.dtype)
try:
return isinstance(cudf.dtype(obj.dtype), cudf.CategoricalDtype)
except TypeError:
return False
if hasattr(obj, "type"):
if obj.type is pd.CategoricalDtype.type:
return True
Expand Down
24 changes: 12 additions & 12 deletions python/cudf/cudf/testing/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,7 @@

import cudf
from cudf._lib.unary import is_nan
from cudf.api.types import (
_is_categorical_dtype,
is_numeric_dtype,
is_string_dtype,
)
from cudf.api.types import is_numeric_dtype, is_string_dtype
from cudf.core.missing import NA, NaT


Expand Down Expand Up @@ -86,7 +82,7 @@ def _check_types(
if (
exact
and not isinstance(left, cudf.MultiIndex)
and _is_categorical_dtype(left)
and isinstance(left.dtype, cudf.CategoricalDtype)
):
if left.dtype != right.dtype:
raise_assert_detail(
Expand Down Expand Up @@ -144,8 +140,8 @@ def assert_column_equal(
"""
if check_dtype is True:
if (
_is_categorical_dtype(left)
and _is_categorical_dtype(right)
isinstance(left.dtype, cudf.CategoricalDtype)
and isinstance(right.dtype, cudf.CategoricalDtype)
and not check_categorical
):
pass
Expand Down Expand Up @@ -173,7 +169,9 @@ def assert_column_equal(
return

if check_exact and check_categorical:
if _is_categorical_dtype(left) and _is_categorical_dtype(right):
if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance(
right.dtype, cudf.CategoricalDtype
):
left_cat = left.categories
right_cat = right.categories

Expand Down Expand Up @@ -207,8 +205,8 @@ def assert_column_equal(

if (
not check_dtype
and _is_categorical_dtype(left)
and _is_categorical_dtype(right)
and isinstance(left.dtype, cudf.CategoricalDtype)
and isinstance(right.dtype, cudf.CategoricalDtype)
):
left = left.astype(left.categories.dtype)
right = right.astype(right.categories.dtype)
Expand Down Expand Up @@ -258,7 +256,9 @@ def assert_column_equal(
raise e
else:
columns_equal = False
if _is_categorical_dtype(left) and _is_categorical_dtype(right):
if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance(
right.dtype, cudf.CategoricalDtype
):
left = left.astype(left.categories.dtype)
right = right.astype(right.categories.dtype)
if not columns_equal:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_column_offset_and_size(pandas_input, offset, size):
children=col.base_children,
)

if cudf.api.types._is_categorical_dtype(col.dtype):
if isinstance(col.dtype, cudf.CategoricalDtype):
assert col.size == col.codes.size
assert col.size == (col.codes.data.size / col.codes.dtype.itemsize)
elif cudf.api.types.is_string_dtype(col.dtype):
Expand Down Expand Up @@ -120,7 +120,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False):
else:
pd_series = series.to_pandas()

if cudf.api.types._is_categorical_dtype(col.dtype):
if isinstance(col.dtype, cudf.CategoricalDtype):
# The cudf.Series is constructed from an already sliced column, whereas
# the pandas.Series is constructed from the unsliced series and then
# sliced, so the indexes should be different and we must ignore it.
Expand Down
66 changes: 21 additions & 45 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pytest

import cudf
from cudf.api.types import _is_categorical_dtype
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
from cudf.testing._utils import (
assert_eq,
Expand Down Expand Up @@ -609,8 +608,8 @@ def test_concat_empty_dataframes(df, other, ignore_index):
actual = cudf.concat(other_gd, ignore_index=ignore_index)
if expected.shape != df.shape:
for key, col in actual[actual.columns].items():
if _is_categorical_dtype(col.dtype):
if not _is_categorical_dtype(expected[key].dtype):
if isinstance(col.dtype, cudf.CategoricalDtype):
if not isinstance(expected[key].dtype, pd.CategoricalDtype):
# TODO: Pandas bug:
# https://github.com/pandas-dev/pandas/issues/42840
expected[key] = expected[key].fillna("-1").astype("str")
Expand Down Expand Up @@ -1195,10 +1194,10 @@ def test_concat_join_series(ignore_index, sort, join, axis):
@pytest.mark.parametrize("ignore_index", [True, False])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("join", ["inner", "outer"])
@pytest.mark.parametrize("axis", [0])
def test_concat_join_empty_dataframes(
df, other, ignore_index, axis, join, sort
request, df, other, ignore_index, join, sort
):
axis = 0
other_pd = [df] + other
gdf = cudf.from_pandas(df)
other_gd = [gdf] + [cudf.from_pandas(o) for o in other]
Expand All @@ -1209,50 +1208,27 @@ def test_concat_join_empty_dataframes(
actual = cudf.concat(
other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort
)
if expected.shape != df.shape:
if axis == 0:
for key, col in actual[actual.columns].items():
if _is_categorical_dtype(col.dtype):
if not _is_categorical_dtype(expected[key].dtype):
# TODO: Pandas bug:
# https://github.com/pandas-dev/pandas/issues/42840
expected[key] = (
expected[key].fillna("-1").astype("str")
)
else:
expected[key] = (
expected[key]
.cat.add_categories(["-1"])
.fillna("-1")
.astype("str")
)
actual[key] = col.astype("str").fillna("-1")
else:
expected[key] = expected[key].fillna(-1)
actual[key] = col.fillna(-1)

assert_eq(
expected.fillna(-1),
actual.fillna(-1),
check_dtype=False,
check_index_type=False
if len(expected) == 0 or actual.empty
else True,
check_column_type=False,
)
else:
# no need to fill in if axis=1
assert_eq(
expected,
actual,
check_index_type=False,
check_column_type=False,
if (
join == "outer"
and any(
isinstance(dtype, pd.CategoricalDtype)
for dtype in df.dtypes.tolist()
)
and any(
isinstance(dtype, pd.CategoricalDtype)
for other_df in other
for dtype in other_df.dtypes.tolist()
)
):
request.applymarker(
pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/42840"
)
)
assert_eq(
expected,
actual,
check_dtype=False,
check_index_type=False,
check_column_type=False,
)

Expand Down Expand Up @@ -1332,7 +1308,7 @@ def test_concat_join_empty_dataframes_axis_1(
if expected.shape != df.shape:
if axis == 0:
for key, col in actual[actual.columns].items():
if _is_categorical_dtype(col.dtype):
if isinstance(expected[key].dtype, pd.CategoricalDtype):
expected[key] = expected[key].fillna("-1")
actual[key] = col.astype("str").fillna("-1")
# if not expected.empty:
Expand Down
Loading

0 comments on commit bba4364

Please sign in to comment.