From bd51a25ea6fdab6ab11e95e2c8192ed7eee43e75 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:05:05 -0400 Subject: [PATCH 1/3] [DOC] Document limitation using `cudf.pandas` proxy arrays (#16955) When instantiating a `cudf.pandas` proxy array, a DtoH transfer occurs so that the data buffer is set correctly. We do this because functions which utilize NumPy's C API can utilize the data buffer directly instead of going through `__array__`. This PR documents this limitation. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16955 --- docs/cudf/source/cudf_pandas/faq.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 34b657488c1..5024747227e 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of: ``` - `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version 24.02 of cudf was the last to support pandas 1.5.x. +- In order for `cudf.pandas` to produce a proxy array that ducktypes as a NumPy + array, we create a proxy type that actually subclasses `numpy.ndarray`. We can + verify this with an isinstance check. + + ```python + %load_ext cudf.pandas + import pandas as pd + import numpy as np + + arr = pd.Series([1, 1, 2]).unique() # returns a proxy array + isinstance(arr, np.ndarray) # returns True, where arr is a proxy array + ``` + Because the proxy type ducktypes as a NumPy array, NumPy functions may attempt to + access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API. + However, our proxy mechanism is designed to proxy function calls at the Python + level, which is incompatible with these types of accesses. To handle these + situations, we perform an eager device-to-host (DtoH) copy, which sets the data + buffer correctly but incurs the cost of extra time when creating the proxy array. + In the previous example, creating `arr` performed this kind of implicit DtoH transfer. + + With this approach, we also get compatibility with third party libraries like `torch`. + + ```python + import torch + x = torch.from_numpy(arr) + ``` ## Can I force running on the CPU? From c7b51195c675af47d0f3dd69c04d0fcc6920eca5 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 9 Oct 2024 15:17:32 -0700 Subject: [PATCH 2/3] Fix `host_span` constructor to correctly copy `is_device_accessible` (#17020) One of the `host_span` constructors was not updated when we added `is_device_accessible`, so the value was not assigned. This PR fixes this simple error and adds tests that checks that this property is correctly set when creating `host_span`s. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17020 --- cpp/include/cudf/utilities/span.hpp | 2 +- .../utilities_tests/pinned_memory_tests.cpp | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 914731ea417..f3e1a61d075 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -288,7 +288,7 @@ struct host_span : public cudf::detail::span_base, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept - : base(other.data(), other.size()) + : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()} { } diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index ae7c6fa8b8c..1e1e21fe18a 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -125,3 +126,22 @@ TEST_F(PinnedMemoryTest, MakeHostVector) EXPECT_FALSE(vec.get_allocator().is_device_accessible()); } } + +TEST_F(PinnedMemoryTest, HostSpan) +{ + auto test_ctors = [](auto&& vec) { + auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible(); + // Test conversion from a vector + auto const span = cudf::host_span{vec}; + EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible); + // Test conversion from host_span with different type + auto const span_converted = cudf::host_span{span}; + EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible); + }; + + cudf::set_allocate_host_as_pinned_threshold(7); + for (int i = 1; i < 10; i++) { + // some iterations will use pinned memory, some will not + test_ctors(cudf::detail::make_host_vector(i, cudf::get_default_stream())); + } +} From 3791c8a9d1aeb7474bb9ef324a089a569183406c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 9 Oct 2024 13:45:02 -1000 Subject: [PATCH 3/3] Add string.convert_floats APIs to pylibcudf (#16990) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16990 --- python/cudf/cudf/_lib/string_casting.pyx | 34 ++---- .../_lib/strings/convert/convert_floats.pyx | 24 ++--- .../strings/convert/convert_floats.pxd | 6 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_floats.pxd | 11 ++ .../strings/convert/convert_floats.pyx | 101 ++++++++++++++++++ .../tests/test_string_convert_floats.py | 33 ++++++ 9 files changed, 165 insertions(+), 48 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index d9595f4ab0a..93b67bd4c9d 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -10,10 +10,6 @@ from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - from_floats as cpp_from_floats, - to_floats as cpp_to_floats, -) from pylibcudf.libcudf.strings.convert.convert_integers cimport ( from_integers as cpp_from_integers, hex_to_integers as cpp_hex_to_integers, @@ -33,32 +29,18 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type def floating_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_floats( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.from_floats( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def string_to_floating(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_floats.to_floats( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type) ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_floats( - input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def dtos(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx index 7965b588703..5da6e3f10cc 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - is_float as cpp_is_float, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def is_float(Column source_strings): @@ -20,12 +13,7 @@ def is_float(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have floats. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.is_float( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd index f4fc4674506..a45c7f9979e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd @@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_floats( - column_view input_col, + column_view strings, data_type output_type) except + cdef unique_ptr[column] from_floats( - column_view input_col) except + + column_view floats) except + cdef unique_ptr[column] is_float( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 41aeb72039b..7b228c06a18 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx convert_ipv4.pyx convert_urls.pyx + convert_fixed_point.pyx convert_floats.pyx convert_ipv4.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index b4b0b521e39..be6145384ad 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( convert_datetime, convert_durations, convert_fixed_point, + convert_floats, convert_ipv4, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 409620fce45..7c94387282b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -4,6 +4,7 @@ convert_datetime, convert_durations, convert_fixed_point, + convert_floats, convert_ipv4, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd new file mode 100644 index 00000000000..1284ff552aa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type) + +cpdef Column from_floats(Column floats) + +cpdef Column is_float(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..8081aadb085 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -0,0 +1,101 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_floats as cpp_convert_floats, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type): + """ + Returns a new numeric column by parsing float values from each string + in the provided strings column. + + For details, see cpp:func:`cudf::strings::to_floats` + + Parameters + ---------- + strings : Column + Strings instance for this operation. + + output_type : DataType + Type of float numeric column to return. + + Returns + ------- + Column + New column with floats converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.to_floats( + strings.view(), + output_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_floats(Column floats): + """ + Returns a new strings column converting the float values from the + provided column into strings. + + For details, see cpp:func:`cudf::strings::from_floats` + + Parameters + ---------- + floats : Column + Numeric column to convert. + + Returns + ------- + Column + New strings column with floats as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.from_floats( + floats.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_float(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to floats. + + For details, see cpp:func:`cudf::strings::is_float` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.is_float( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py new file mode 100644 index 00000000000..e9918fab559 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_floats(): + typ = pa.float32() + arr = pa.array(["-1.23", "1", None]) + result = plc.strings.convert.convert_floats.to_floats( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_floats(): + arr = pa.array([-1.23, 1, None]) + result = plc.strings.convert.convert_floats.from_floats( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["-1.23", "1.0", None]) + assert_column_eq(result, expected) + + +def test_is_float(): + arr = pa.array(["-1.23", "1", "1.2.3", "A", None]) + result = plc.strings.convert.convert_floats.is_float( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected)