
Commit

Merge remote-tracking branch 'upstream/feature/cudf-polars' into feat/manas_polars_docs
Manas Singh committed Sep 16, 2024
2 parents 36bb894 + 1b5cb1a commit 71ea75a
Showing 14 changed files with 554 additions and 66 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/pr.yaml
@@ -27,6 +27,7 @@ jobs:
- wheel-tests-cudf
- wheel-build-cudf-polars
- wheel-tests-cudf-polars
- cudf-polars-polars-tests
- wheel-build-dask-cudf
- wheel-tests-dask-cudf
- devcontainer
@@ -154,6 +155,17 @@ jobs:
# This always runs, but only fails if this PR touches code in
# pylibcudf or cudf_polars
script: "ci/test_wheel_cudf_polars.sh"
cudf-polars-polars-tests:
needs: wheel-build-cudf-polars
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
# This always runs, but only fails if this PR touches code in
# pylibcudf or cudf_polars
script: "ci/test_cudf_polars_polars_tests.sh"
wheel-build-dask-cudf:
needs: wheel-build-cudf
secrets: inherit
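
For reference, a rough Python sketch of what the matrix_filter expression above selects: keep only ARCH == "amd64" entries, group them by CUDA major version, and take the entry with the latest Python (then CUDA) version in each group. The matrix entries below are made up for illustration; the real matrix comes from rapidsai/shared-workflows.

# Hypothetical matrix entries, for illustration only.
matrix = [
    {"ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "11.8.0"},
    {"ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "11.8.0"},
    {"ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "12.5.1"},
    {"ARCH": "arm64", "PY_VER": "3.11", "CUDA_VER": "12.5.1"},
]

def vtuple(version):
    # Turn "11.8.0" into (11, 8, 0) so versions compare numerically.
    return tuple(int(part) for part in version.split("."))

amd64 = [entry for entry in matrix if entry["ARCH"] == "amd64"]
selected = []
for major in sorted({vtuple(entry["CUDA_VER"])[0] for entry in amd64}):
    group = [entry for entry in amd64 if vtuple(entry["CUDA_VER"])[0] == major]
    # Latest Python first, then latest CUDA, within each CUDA major version.
    selected.append(max(group, key=lambda e: (vtuple(e["PY_VER"]), vtuple(e["CUDA_VER"]))))

print(selected)
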
27 changes: 27 additions & 0 deletions ci/run_cudf_polars_polars_tests.sh
@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

set -euo pipefail

# Support invoking run_cudf_polars_polars_tests.sh outside the script directory.
# Assumption: polars has been cloned into the root of the repo.
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/

DESELECTED_TESTS=(
"tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place
"tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode
"tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error
"tests/docs/test_user_guide.py" # No dot binary in CI image
)

DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}")
python -m pytest \
--import-mode=importlib \
--cache-clear \
-m "" \
-p cudf_polars.testing.plugin \
-v \
--tb=short \
${DESELECTED_TESTS} \
"$@" \
py-polars/tests
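
The -p cudf_polars.testing.plugin option loads cudf_polars' pytest plugin so that the upstream polars test suite exercises the GPU engine. As a hedged illustration of what that amounts to (assuming a polars release with GPU engine support and cudf_polars installed), collecting a lazy query on the GPU looks roughly like this:

import polars as pl

lf = pl.LazyFrame({"a": [1, 1, 2], "b": [4.0, 5.0, 6.0]})

# Collect on the GPU engine (backed by cudf_polars). raise_on_fail=True
# raises an error instead of silently falling back to the CPU engine.
result = (
    lf.group_by("a")
    .agg(pl.col("b").sum())
    .collect(engine=pl.GPUEngine(raise_on_fail=True))
)
print(result)
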
68 changes: 68 additions & 0 deletions ci/test_cudf_polars_polars_tests.sh
@@ -0,0 +1,68 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

set -eou pipefail

# We will only fail these tests if the PR touches code in pylibcudf
# or cudf_polars itself.
# Note: the three dots diff HEAD against the merge-base of upstream and
# HEAD. So this asks "does _this branch_ touch files in
# cudf_polars/pylibcudf?", rather than "are there changes between
# upstream and this branch which touch cudf_polars/pylibcudf?"
# TODO: is the target branch exposed anywhere in an environment variable?
if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
then
HAS_CHANGES=1
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
else
HAS_CHANGES=0
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
fi

rapids-logger "Download wheels"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist

# Download the cudf built in the previous step
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep

rapids-logger "Install cudf"
python -m pip install ./local-cudf-dep/cudf*.whl

rapids-logger "Install cudf_polars"
python -m pip install $(echo ./dist/cudf_polars*.whl)

TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
rapids-logger "Clone polars to ${TAG}"
git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1

# Install requirements for running polars tests
rapids-logger "Install polars test requirements"
python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt

function set_exitcode()
{
EXITCODE=$?
}
EXITCODE=0
trap set_exitcode ERR
set +e

rapids-logger "Run polars tests"
./ci/run_cudf_polars_polars_tests.sh

trap - ERR
set -e

if [ ${EXITCODE} != 0 ]; then
rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}"
else
rapids-logger "Running polars test suite PASSED"
fi

if [ ${HAS_CHANGES} == 1 ]; then
exit ${EXITCODE}
else
exit 0
fi
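
Both new CI scripts gate failure on whether this branch touches cudf_polars or pylibcudf using a three-dot diff: BASE...HEAD compares HEAD against the merge-base of BASE and HEAD, so only this branch's own changes count. A rough Python sketch of that check, for illustration only (the branch_touches helper is hypothetical and not part of the CI scripts):

import subprocess

def branch_touches(paths, base="origin/branch-24.08"):
    # git diff --name-only BASE...HEAD lists files changed between the
    # merge-base of BASE and HEAD, and HEAD -- i.e. this branch's changes.
    changed = subprocess.run(
        ["git", "diff", "--name-only", f"{base}...HEAD", "--", *paths],
        capture_output=True,
        text=True,
        check=True,
    ).stdout.split()
    return bool(changed)

# Test failures only fail CI when the branch touches these directories.
has_changes = branch_touches(
    ["python/cudf_polars/", "python/cudf/cudf/_lib/pylibcudf/"]
)
print("treat polars test failures as CI failures:", has_changes)
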
6 changes: 6 additions & 0 deletions ci/test_wheel_cudf_polars.sh
@@ -13,15 +13,21 @@ set -eou pipefail
if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
then
HAS_CHANGES=1
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
else
HAS_CHANGES=0
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
fi

rapids-logger "Download wheels"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist

# Download the cudf built in the previous step
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep

rapids-logger "Install cudf"
python -m pip install ./local-cudf-dep/cudf*.whl

rapids-logger "Install cudf_polars"
42 changes: 5 additions & 37 deletions python/cudf/cudf/_lib/datetime.pyx
@@ -16,6 +16,8 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar

import cudf._lib.pylibcudf as plc


@acquire_spill_lock()
def add_months(Column col, Column months):
@@ -37,43 +39,9 @@ def add_months(Column col, Column months):

@acquire_spill_lock()
def extract_datetime_component(Column col, object field):

cdef unique_ptr[column] c_result
cdef column_view col_view = col.view()

with nogil:
if field == "year":
c_result = move(libcudf_datetime.extract_year(col_view))
elif field == "month":
c_result = move(libcudf_datetime.extract_month(col_view))
elif field == "day":
c_result = move(libcudf_datetime.extract_day(col_view))
elif field == "weekday":
c_result = move(libcudf_datetime.extract_weekday(col_view))
elif field == "hour":
c_result = move(libcudf_datetime.extract_hour(col_view))
elif field == "minute":
c_result = move(libcudf_datetime.extract_minute(col_view))
elif field == "second":
c_result = move(libcudf_datetime.extract_second(col_view))
elif field == "millisecond":
c_result = move(
libcudf_datetime.extract_millisecond_fraction(col_view)
)
elif field == "microsecond":
c_result = move(
libcudf_datetime.extract_microsecond_fraction(col_view)
)
elif field == "nanosecond":
c_result = move(
libcudf_datetime.extract_nanosecond_fraction(col_view)
)
elif field == "day_of_year":
c_result = move(libcudf_datetime.day_of_year(col_view))
else:
raise ValueError(f"Invalid datetime field: '{field}'")

result = Column.from_unique_ptr(move(c_result))
result = Column.from_pylibcudf(
plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field)
)

if field == "weekday":
# Pandas counts Monday-Sunday as 0-6
49 changes: 49 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/datetime.pyx
@@ -4,6 +4,16 @@ from libcpp.utility cimport move

from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.datetime cimport (
day_of_year as cpp_day_of_year,
extract_day as cpp_extract_day,
extract_hour as cpp_extract_hour,
extract_microsecond_fraction as cpp_extract_microsecond_fraction,
extract_millisecond_fraction as cpp_extract_millisecond_fraction,
extract_minute as cpp_extract_minute,
extract_month as cpp_extract_month,
extract_nanosecond_fraction as cpp_extract_nanosecond_fraction,
extract_second as cpp_extract_second,
extract_weekday as cpp_extract_weekday,
extract_year as cpp_extract_year,
)

@@ -31,3 +41,42 @@ cpdef Column extract_year(
with nogil:
result = move(cpp_extract_year(values.view()))
return Column.from_libcudf(move(result))


def extract_datetime_component(Column col, str field):

cdef unique_ptr[column] c_result

with nogil:
if field == "year":
c_result = move(cpp_extract_year(col.view()))
elif field == "month":
c_result = move(cpp_extract_month(col.view()))
elif field == "day":
c_result = move(cpp_extract_day(col.view()))
elif field == "weekday":
c_result = move(cpp_extract_weekday(col.view()))
elif field == "hour":
c_result = move(cpp_extract_hour(col.view()))
elif field == "minute":
c_result = move(cpp_extract_minute(col.view()))
elif field == "second":
c_result = move(cpp_extract_second(col.view()))
elif field == "millisecond":
c_result = move(
cpp_extract_millisecond_fraction(col.view())
)
elif field == "microsecond":
c_result = move(
cpp_extract_microsecond_fraction(col.view())
)
elif field == "nanosecond":
c_result = move(
cpp_extract_nanosecond_fraction(col.view())
)
elif field == "day_of_year":
c_result = move(cpp_day_of_year(col.view()))
else:
raise ValueError(f"Invalid datetime field: '{field}'")

return Column.from_libcudf(move(c_result))
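
A short usage sketch of the new pylibcudf entry point, mirroring the pyarrow interop pattern used in the tests below; the input values here are illustrative:

import datetime

import pyarrow as pa

import cudf._lib.pylibcudf as plc

# Build a timestamp column via pyarrow interop, then extract a component.
arrow_col = pa.array(
    [datetime.datetime(2024, 2, 29, 3, 14, 15), None],
    type=pa.timestamp("us"),
)
col = plc.interop.from_arrow(arrow_col)

# libcudf returns int16 values; nulls stay null.
months = plc.datetime.extract_datetime_component(col, "month")
print(plc.interop.to_arrow(months))  # [2, null]
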
42 changes: 38 additions & 4 deletions python/cudf/cudf/pylibcudf_tests/test_datetime.py
@@ -1,16 +1,18 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import datetime
import functools

import pyarrow as pa
import pyarrow.compute as pc
import pytest
from utils import assert_column_eq

import cudf._lib.pylibcudf as plc


@pytest.fixture
def column(has_nulls):
def date_column(has_nulls):
values = [
datetime.date(1999, 1, 1),
datetime.date(2024, 10, 12),
@@ -22,9 +24,41 @@ def column(has_nulls):
return plc.interop.from_arrow(pa.array(values, type=pa.date32()))


def test_extract_year(column):
got = plc.datetime.extract_year(column)
@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"])
def datetime_column(has_nulls, request):
values = [
datetime.datetime(1999, 1, 1),
datetime.datetime(2024, 10, 12),
datetime.datetime(1970, 1, 1),
datetime.datetime(2260, 1, 1),
datetime.datetime(2024, 2, 29, 3, 14, 15),
datetime.datetime(2024, 2, 29, 3, 14, 15, 999),
]
if has_nulls:
values[2] = None
return plc.interop.from_arrow(
pa.array(values, type=pa.timestamp(request.param))
)


@pytest.mark.parametrize(
"component, pc_fun",
[
("year", pc.year),
("month", pc.month),
("day", pc.day),
("weekday", functools.partial(pc.day_of_week, count_from_zero=False)),
("hour", pc.hour),
("minute", pc.minute),
("second", pc.second),
("millisecond", pc.millisecond),
("microsecond", pc.microsecond),
("nanosecond", pc.nanosecond),
],
)
def test_extraction(datetime_column, component, pc_fun):
got = plc.datetime.extract_datetime_component(datetime_column, component)
# libcudf produces an int16, arrow produces an int64
expect = pa.compute.year(plc.interop.to_arrow(column)).cast(pa.int16())
expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16())

assert_column_eq(expect, got)