From 119aa9d9c5cffc2de460f52f11fb4a78f8b51dce Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:08:26 -1000 Subject: [PATCH 1/7] Add string.convert.convert_fixed_type APIs to pylibcudf (#16984) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16984 --- .../strings/convert/convert_fixed_point.pyx | 69 +++-------- .../strings/convert/convert_fixed_point.pxd | 8 +- .../pylibcudf/strings/convert/CMakeLists.txt | 4 +- .../pylibcudf/strings/convert/__init__.pxd | 7 +- .../pylibcudf/strings/convert/__init__.py | 7 +- .../strings/convert/convert_fixed_point.pxd | 11 ++ .../strings/convert/convert_fixed_point.pyx | 107 ++++++++++++++++++ .../tests/test_string_convert_fixed_point.py | 34 ++++++ 8 files changed, 188 insertions(+), 59 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index a8df8c9a92c..96dcd021c3b 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -1,22 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -import cudf - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( - from_fixed_point as cpp_from_fixed_point, - is_fixed_point as cpp_is_fixed_point, - to_fixed_point as cpp_to_fixed_point, -) -from pylibcudf.libcudf.types cimport data_type, type_id - from cudf._lib.column cimport Column +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import pylibcudf as plc @acquire_spill_lock() @@ -32,14 +21,10 @@ def from_decimal(Column input_col): ------- A column of strings representing the input decimal values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_fixed_point( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,25 +42,11 @@ def to_decimal(Column input_col, object out_type): ------- A column of decimals parsed from the string values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef int scale = out_type.scale - cdef data_type c_out_type - if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(type_id.DECIMAL32, -scale) - elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(type_id.DECIMAL64, -scale) - elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(type_id.DECIMAL128, -scale) - else: - raise TypeError("should be a decimal dtype") - with nogil: - c_result = move( - cpp_to_fixed_point( - input_column_view, - c_out_type)) - - result = Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type), + ) + result = Column.from_pylibcudf(plc_column) result.dtype.precision = out_type.precision return result @@ -98,14 +69,8 @@ def is_fixed_point(Column input_col, object dtype): ------- A Column of booleans indicating valid decimal conversion. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = input_col.view() - cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) - with nogil: - c_result = move(cpp_is_fixed_point( - source_view, - c_dtype - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(dtype), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index 6f820f3c9a4..72ab329f2dd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -9,13 +9,13 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_fixed_point( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] from_fixed_point( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] is_fixed_point( - column_view source_strings, - data_type output_type + column_view input, + data_type decimal_type ) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 3febc78dfd2..fe8da975566 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. # ============================================================================= -set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx + convert_fixed_point.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 5525bca46d6..36abf463371 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_booleans, convert_datetime, convert_durations +from . cimport ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 2340ebe9a26..c0be4093836 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_booleans, convert_datetime, convert_durations +from . import ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd new file mode 100644 index 00000000000..049b9b3fffe --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_fixed_point(Column input, DataType output_type) + +cpdef Column from_fixed_point(Column input) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx new file mode 100644 index 00000000000..40dadf6f967 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -0,0 +1,107 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_fixed_point as cpp_fixed_point, +) +from pylibcudf.types cimport DataType, type_id + + +cpdef Column to_fixed_point(Column input, DataType output_type): + """ + Returns a new fixed-point column parsing decimal values from the + provided strings column. + + For details, see :cpp:details:`cudf::strings::to_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of fixed-point column to return including the scale value. + + Returns + ------- + Column + New column of output_type. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_fixed_point(Column input): + """ + Returns a new strings column converting the fixed-point values + into a strings column. + + For details, see :cpp:details:`cudf::strings::from_fixed_point` + + Parameters + ---------- + input : Column + Fixed-point column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.from_fixed_point( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to fixed-point. + + For details, see :cpp:details:`cudf::strings::is_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + decimal_type : DataType + Fixed-point type (with scale) used only for checking overflow. + Defaults to Decimal64 + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if decimal_type is None: + decimal_type = DataType(type_id.DECIMAL64) + + with nogil: + c_result = move( + cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py new file mode 100644 index 00000000000..b1c4d729604 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import decimal + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_fixed_point(): + typ = pa.decimal128(38, 2) + arr = pa.array(["123", "1.23", None]) + result = plc.strings.convert.convert_fixed_point.to_fixed_point( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_fixed_point(): + arr = pa.array([decimal.Decimal("1.1"), None]) + result = plc.strings.convert.convert_fixed_point.from_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["1.1", None]) + assert_column_eq(result, expected) + + +def test_is_fixed_point(): + arr = pa.array(["123", "1.23", "1.2.3", "", None]) + result = plc.strings.convert.convert_fixed_point.is_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) From 77f3a5d3229ed1b3186fe9f4d5b5b04d124c6a4d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:33:44 -1000 Subject: [PATCH 2/7] Add docstrings and test for strings.convert_durations APIs for pylibcudf (#16982) Contributes to https://github.com/rapidsai/cudf/issues/15162 Since the implementation already existed: * Added docstrings * Like https://github.com/rapidsai/cudf/pull/16971, made the `format` parameter accept `str` instead * Aligned parameter names closer to pylibcudf * Added missing `move`s * Moved `convert_duration` tests to `test_string_convert_duration.py` and added a new test for `from_durations` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16982 --- python/cudf/cudf/_lib/string_casting.pyx | 7 +- .../strings/convert/convert_durations.pxd | 2 +- .../strings/convert/convert_durations.pxd | 6 +- .../strings/convert/convert_durations.pyx | 73 ++++++++++++++++--- .../pylibcudf/tests/test_string_convert.py | 43 ----------- .../tests/test_string_convert_durations.py | 61 ++++++++++++++++ 6 files changed, 130 insertions(+), 62 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 55ff38f472d..fe19379bf93 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -525,12 +525,11 @@ def timedelta2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.to_durations( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -549,12 +548,10 @@ def int2timedelta(Column input_col, str format): A Column with Timedelta represented in string format """ - - cdef string c_duration_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.from_durations( input_col.to_pylibcudf(mode="read"), - c_duration_format + format ) ) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd index ebe10574353..43ffad1d89f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd @@ -10,7 +10,7 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_durations( - const column_view & strings_col, + const column_view & input, data_type duration_type, const string & format) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd index ac11b8959ed..eecdade4ef9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -8,10 +8,10 @@ from pylibcudf.types cimport DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index f3e0b7c9c8e..76c5809c3d5 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -15,27 +15,80 @@ from pylibcudf.types import DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ): + """ + Returns a new duration column converting a strings column into + durations using the provided format pattern. + + For details, see cpp:func:`cudf::strings::to_durations` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + duration_type : DataType + The duration type used for creating the output column. + + format : str + String specifying the duration format in strings. + + Returns + ------- + Column + New duration column. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() + with nogil: - c_result = cpp_convert_durations.to_durations( - input.view(), - duration_type.c_obj, - format + c_result = move( + cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + c_format + ) ) return Column.from_libcudf(move(c_result)) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=None ): + """ + Returns a new strings column converting a duration column into + strings using the provided format pattern. + + For details, see cpp:func:`cudf::strings::from_durations` + + Parameters + ---------- + durations : Column + Duration values to convert. + + format : str + The string specifying output format. + Default format is "%D days %H:%M:%S". + + Returns + ------- + Column + New strings column with formatted durations. + """ cdef unique_ptr[column] c_result + + if format is None: + format = "%D days %H:%M:%S" + cdef string c_format = format.encode() + with nogil: - c_result = cpp_convert_durations.from_durations( - input.view(), - format + c_result = move( + cpp_convert_durations.from_durations( + durations.view(), + c_format + ) ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index 22bb4971cb1..69f7a0fdd33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -1,7 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from datetime import datetime - import pyarrow as pa import pylibcudf as plc import pytest @@ -21,39 +19,16 @@ def timestamp_type(request): return request.param -@pytest.fixture( - scope="module", - params=[ - pa.duration("ns"), - pa.duration("us"), - pa.duration("ms"), - pa.duration("s"), - ], -) -def duration_type(request): - return request.param - - @pytest.fixture(scope="module") def pa_timestamp_col(): return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) -@pytest.fixture(scope="module") -def pa_duration_col(): - return pa.array(["05:20:25"]) - - @pytest.fixture(scope="module") def plc_timestamp_col(pa_timestamp_col): return plc.interop.from_arrow(pa_timestamp_col) -@pytest.fixture(scope="module") -def plc_duration_col(pa_duration_col): - return plc.interop.from_arrow(pa_duration_col) - - @pytest.mark.parametrize("format", ["%Y-%m-%d"]) def test_to_datetime( pa_timestamp_col, plc_timestamp_col, timestamp_type, format @@ -65,21 +40,3 @@ def test_to_datetime( format, ) assert_column_eq(expect, got) - - -@pytest.mark.parametrize("format", ["%H:%M:%S"]) -def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): - def to_timedelta(duration_str): - date = datetime.strptime(duration_str, format) - return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date - - expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( - duration_type - ) - - got = plc.strings.convert.convert_durations.to_durations( - plc_duration_col, - plc.interop.from_arrow(duration_type), - format.encode(), - ) - assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py new file mode 100644 index 00000000000..6d704309bfd --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from datetime import datetime, timedelta + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +def test_to_duration(pa_duration_col, plc_duration_col, duration_type): + format = "%H:%M:%S" + + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format, + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", [None, "%D days %H:%M:%S"]) +def test_from_durations(format): + pa_array = pa.array( + [timedelta(days=1, hours=1, minutes=1, seconds=1), None] + ) + result = plc.strings.convert.convert_durations.from_durations( + plc.interop.from_arrow(pa_array), format + ) + expected = pa.array(["1 days 01:01:01", None]) + assert_column_eq(result, expected) From c958d8e88d8c0cb149b1442ab91705853167a609 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 4 Oct 2024 18:45:30 -0500 Subject: [PATCH 3/7] Upgrade pandas pinnings to support `2.2.3` (#16882) Pandas released a newer version `2.2.3` with very minimal fixes but one that adds support for python-3.13 and numpy 2.1 compatibility. This PR updates pinnings in `cudf` to support `2.2.3`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16882 --- ci/test_python_cudf.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/pylibcudf/meta.yaml | 2 +- dependencies.yaml | 6 +++++- python/cudf/cudf/core/_compat.py | 2 +- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 9 files changed, 13 insertions(+), 9 deletions(-) diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index 2386414b32e..9528549a562 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi - +rapids-print-env EXITCODE=0 trap "EXITCODE=1" ERR set +e diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8b45d26c367..bd5e6c3d569 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -63,7 +63,7 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc - polars>=1.8,<1.9 - pre-commit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 354c1360e5a..565a3ebfa3c 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -61,7 +61,7 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc - polars>=1.8,<1.9 - pre-commit diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 25e69b89789..2c254415318 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - numba-cuda >=0.0.13 - numpy >=1.23,<3.0a0 diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 7c1efa0176c..3d965f30986 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -77,7 +77,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/dependencies.yaml b/dependencies.yaml index b192158c4ea..3561b22965d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -602,7 +602,7 @@ dependencies: packages: - fsspec>=0.6.0 - &numpy numpy>=1.23,<3.0a0 - - pandas>=2.0,<2.2.3dev0 + - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: - output_types: [conda, requirements, pyproject] @@ -748,6 +748,10 @@ dependencies: packages: - *numba-cuda-dep - pandas==2.0.* + - matrix: {dependencies: "latest"} + packages: + - numba-cuda==0.0.15 + - pandas==2.2.3 - matrix: packages: - output_types: conda diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e2bdecbe67a..871ffc6269d 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -3,7 +3,7 @@ import pandas as pd from packaging import version -PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.2") +PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.3") PANDAS_VERSION = version.parse(pd.__version__) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 605f9be5a49..1b730ffd13c 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "ptxcompiler", "pyarrow>=14.0.0,<18.0.0a0", "pylibcudf==24.12.*,>=0.0.0a0", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 76e47b50c3b..ce825c7647b 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "rapids-dask-dependency==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ From 33b8dfa42ff9a600adfa6d10c7740169a0340338 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 15:30:19 -1000 Subject: [PATCH 4/7] Add string.convert.convert_ipv4 APIs to pylibcudf (#16994) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16994 --- python/cudf/cudf/_lib/string_casting.pyx | 42 +++------ .../libcudf/strings/convert/convert_ipv4.pxd | 6 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_ipv4.pxd | 10 ++ .../strings/convert/convert_ipv4.pyx | 92 +++++++++++++++++++ .../tests/test_string_convert_ipv4.py | 31 +++++++ 8 files changed, 151 insertions(+), 34 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index fe19379bf93..76c862a8657 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -22,11 +22,6 @@ from pylibcudf.libcudf.strings.convert.convert_integers cimport ( is_hex as cpp_is_hex, to_integers as cpp_to_integers, ) -from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( - integers_to_ipv4 as cpp_integers_to_ipv4, - ipv4_to_integers as cpp_ipv4_to_integers, - is_ipv4 as cpp_is_ipv4, -) from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id @@ -569,14 +564,10 @@ def int2ip(Column input_col): A Column with integer represented in string ipv4 format """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_ipv4(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def ip2int(Column input_col): @@ -592,14 +583,10 @@ def ip2int(Column input_col): A Column with ipv4 represented as integer """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ipv4_to_integers(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def is_ipv4(Column source_strings): @@ -608,15 +595,10 @@ def is_ipv4(Column source_strings): that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn where nnn is integer digits in [0,255]. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_ipv4( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.is_ipv4( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def htoi(Column input_col, **kwargs): diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index fe571cfced6..801db438e92 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -8,11 +8,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] integers_to_ipv4( - column_view input_col) except + + column_view integers) except + cdef unique_ptr[column] is_ipv4( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index fe8da975566..eb0d6ee6999 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx + convert_fixed_point.pyx convert_ipv4.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 36abf463371..431beed8e5d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -4,4 +4,5 @@ from . cimport ( convert_datetime, convert_durations, convert_fixed_point, + convert_ipv4, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index c0be4093836..a601b562c2e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -4,4 +4,5 @@ convert_datetime, convert_durations, convert_fixed_point, + convert_ipv4, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd new file mode 100644 index 00000000000..c61f5c0bdca --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column ipv4_to_integers(Column input) + +cpdef Column integers_to_ipv4(Column integers) + +cpdef Column is_ipv4(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx new file mode 100644 index 00000000000..f2a980d4269 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 + + +cpdef Column ipv4_to_integers(Column input): + """ + Converts IPv4 addresses into integers. + + For details, see cpp:func:`cudf::strings::ipv4_to_integers` + + Parameters + ---------- + input : Column + Strings instance for this operation + + Returns + ------- + Column + New uint32 column converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.ipv4_to_integers( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_ipv4(Column integers): + """ + Converts integers into IPv4 addresses as strings. + + For details, see cpp:func:`cudf::strings::integers_to_ipv4` + + Parameters + ---------- + integers : Column + Integer (uint32) column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.integers_to_ipv4( + integers.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_ipv4(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from IPv4 format. + + For details, see cpp:func:`cudf::strings::is_ipv4` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.is_ipv4( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py new file mode 100644 index 00000000000..4dc3e512624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_ipv4_to_integers(): + arr = pa.array(["123.45.67.890", None]) + result = plc.strings.convert.convert_ipv4.ipv4_to_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array([2066564730, None], type=pa.uint32()) + assert_column_eq(result, expected) + + +def test_integers_to_ipv4(): + arr = pa.array([1, 0, None], type=pa.uint32()) + result = plc.strings.convert.convert_ipv4.integers_to_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["0.0.0.1", "0.0.0.0", None]) + assert_column_eq(result, expected) + + +def test_is_ipv4(): + arr = pa.array(["0.0.0.1", "1.2.34", "A", None]) + result = plc.strings.convert.convert_ipv4.is_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, False, False, None]) + assert_column_eq(result, expected) From fcff2b6ef7d6db62fc064ad10ffc6c873fc85b58 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Sat, 5 Oct 2024 02:52:53 -0500 Subject: [PATCH 5/7] Fix write_json to handle empty string column (#16995) Add empty string column condition for write_json bypass make_strings_children for empty column because when grid size is zero, it throws cuda error. Authors: - Karthikeyan (https://github.com/karthikeyann) - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - David Wendt (https://github.com/davidwendt) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16995 --- cpp/src/io/json/write_json.cu | 3 +++ cpp/tests/io/json/json_writer.cpp | 37 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index dc7199d7ab1..e1241f8f90c 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -170,6 +170,9 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + if (column_v.is_empty()) { // empty begets empty + return make_empty_column(type_id::STRING); + } auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); diff --git a/cpp/tests/io/json/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp index 2c4e29a01b9..39d31c406a5 100644 --- a/cpp/tests/io/json/json_writer.cpp +++ b/cpp/tests/io/json/json_writer.cpp @@ -70,6 +70,43 @@ TEST_F(JsonWriterTest, EmptyInput) EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } +TEST_F(JsonWriterTest, EmptyLeaf) +{ + cudf::test::strings_column_wrapper col1{""}; + cudf::test::fixed_width_column_wrapper offsets{0, 0}; + auto col2 = make_lists_column(1, + offsets.release(), + cudf::test::strings_column_wrapper{}.release(), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); + auto col3 = cudf::test::lists_column_wrapper::make_one_empty_row_column(); + cudf::table_view tbl_view{{col1, *col2, col3}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + + // Empty columns in table + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected = R"([{"col1":"","col2":[],"col3":[]}])"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + + // Empty columns in table - JSON Lines + out_buffer.clear(); + out_options.enable_lines(true); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected_lines = R"({"col1":"","col2":[],"col3":[]})" + "\n"; + EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); +} + TEST_F(JsonWriterTest, ErrorCases) { cudf::test::strings_column_wrapper col1{"a", "b", "c"}; From bfd568b4f5c4dd9799b60a2975c1fd183e9b99aa Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:04:24 -0400 Subject: [PATCH 6/7] Remove unused import (#17005) This PR removes an unused unused import in cudf which was causing errors in doc builds. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17005 --- python/cudf/cudf/_lib/string_casting.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 76c862a8657..d9595f4ab0a 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -6,7 +6,6 @@ from cudf._lib.scalar import as_device_scalar from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from libcpp.memory cimport unique_ptr -from libcpp.string cimport string from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column From f926a61c7d31b7b33c3a3482507e9efb44b2cc36 Mon Sep 17 00:00:00 2001 From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:37:55 -0400 Subject: [PATCH 7/7] Add release tracking to project automation scripts (#17001) This PR adds two new jobs to the project automations. One to extract the version number from the branch name, and one to set the project `Release` field to the version found. Authors: - Ben Jarmak (https://github.com/jarmak-nv) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17001 --- .../workflows/pr_issue_status_automation.yml | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index af8d1289ea1..6f0e88fb245 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -62,3 +62,33 @@ jobs: UPDATE_ITEM: true UPDATE_LINKED_ISSUES: true secrets: inherit + + process-branch-name: + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: get-project-id + runs-on: ubuntu-latest + outputs: + branch-name: ${{ steps.process-branch-name.outputs.branch-name }} + steps: + - name: Extract branch name + id: process-branch-name + run: | + branch=${{ github.event.pull_request.base.ref }} + release=${branch#branch-} + echo "branch-name=$release" >> "$GITHUB_OUTPUT" + + update-release: + # This job sets the PR and its linked issues to the release they are targeting + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12 + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: [get-project-id, process-branch-name] + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ" + SINGLE_SELECT_FIELD_NAME: "Release" + SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}" + ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + UPDATE_ITEM: true + UPDATE_LINKED_ISSUES: true + secrets: inherit