Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.12' into pylibcudf/s…
Browse files Browse the repository at this point in the history
…trings/convert_lists
  • Loading branch information
mroeschke committed Oct 9, 2024
2 parents edfaa92 + 3791c8a commit 910241e
Show file tree
Hide file tree
Showing 12 changed files with 214 additions and 50 deletions.
2 changes: 1 addition & 1 deletion cpp/include/cudf/utilities/span.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
std::is_convertible_v<OtherT (*)[], T (*)[]>, // NOLINT
void>* = nullptr>
constexpr host_span(host_span<OtherT, OtherExtent> const& other) noexcept
: base(other.data(), other.size())
: base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()}
{
}

Expand Down
20 changes: 20 additions & 0 deletions cpp/tests/utilities_tests/pinned_memory_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/io/parquet.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/pinned_memory.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/mr/device/pool_memory_resource.hpp>
#include <rmm/mr/pinned_host_memory_resource.hpp>
Expand Down Expand Up @@ -125,3 +126,22 @@ TEST_F(PinnedMemoryTest, MakeHostVector)
EXPECT_FALSE(vec.get_allocator().is_device_accessible());
}
}

TEST_F(PinnedMemoryTest, HostSpan)
{
  // Verifies that a host_span reports the same device accessibility as the
  // vector it was created from, both when built directly from the vector and
  // when converted from another host_span with a different element type.
  auto const check_span_conversions = [](auto&& host_vec) {
    auto const expected_accessible = host_vec.get_allocator().is_device_accessible();

    // host_span constructed directly from the vector
    cudf::host_span<int16_t> const from_vector{host_vec};
    EXPECT_EQ(from_vector.is_device_accessible(), expected_accessible);

    // host_span constructed from a host_span of a different (non-const) type
    cudf::host_span<int16_t const> const from_span{from_vector};
    EXPECT_EQ(from_span.is_device_accessible(), expected_accessible);
  };

  cudf::set_allocate_host_as_pinned_threshold(7);

  // Sweep the vector size so that some iterations use pinned memory and
  // some do not.
  for (int num_elements = 1; num_elements < 10; ++num_elements) {
    check_span_conversions(
      cudf::detail::make_host_vector<int16_t>(num_elements, cudf::get_default_stream()));
  }
}
26 changes: 26 additions & 0 deletions docs/cudf/source/cudf_pandas/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of:
```
- `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version
24.02 of cudf was the last to support pandas 1.5.x.
- In order for `cudf.pandas` to produce a proxy array that ducktypes as a NumPy
array, we create a proxy type that actually subclasses `numpy.ndarray`. We can
verify this with an isinstance check.

```python
%load_ext cudf.pandas
import pandas as pd
import numpy as np

arr = pd.Series([1, 1, 2]).unique() # returns a proxy array
isinstance(arr, np.ndarray) # returns True, where arr is a proxy array
```
Because the proxy type ducktypes as a NumPy array, NumPy functions may attempt to
access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API.
However, our proxy mechanism is designed to proxy function calls at the Python
level, which is incompatible with these types of accesses. To handle these
situations, we perform an eager device-to-host (DtoH) copy, which sets the data
buffer correctly but incurs the cost of extra time when creating the proxy array.
In the previous example, creating `arr` performed this kind of implicit DtoH transfer.

With this approach, we also get compatibility with third-party libraries such as `torch`.

```python
import torch
x = torch.from_numpy(arr)
```

## Can I force running on the CPU?

Expand Down
34 changes: 8 additions & 26 deletions python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,6 @@ from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.convert.convert_floats cimport (
from_floats as cpp_from_floats,
to_floats as cpp_to_floats,
)
from pylibcudf.libcudf.strings.convert.convert_integers cimport (
from_integers as cpp_from_integers,
hex_to_integers as cpp_hex_to_integers,
Expand All @@ -33,32 +29,18 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type


def floating_to_string(Column input_col):
cdef column_view input_column_view = input_col.view()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_from_floats(
input_column_view))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_floats.from_floats(
input_col.to_pylibcudf(mode="read"),
)
return Column.from_pylibcudf(plc_column)


def string_to_floating(Column input_col, object out_type):
cdef column_view input_column_view = input_col.view()
cdef unique_ptr[column] c_result
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
)
plc_column = plc.strings.convert.convert_floats.to_floats(
input_col.to_pylibcudf(mode="read"),
dtype_to_pylibcudf_type(out_type)
)
cdef data_type c_out_type = data_type(tid)
with nogil:
c_result = move(
cpp_to_floats(
input_column_view,
c_out_type))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


def dtos(Column input_col):
Expand Down
24 changes: 6 additions & 18 deletions python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
Original file line number Diff line number Diff line change
@@ -1,31 +1,19 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.convert.convert_floats cimport (
is_float as cpp_is_float,
)

from cudf._lib.column cimport Column

import pylibcudf as plc


@acquire_spill_lock()
def is_float(Column source_strings):
"""
Returns a Column of boolean values with True for `source_strings`
that have floats.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_is_float(
source_view
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.convert.convert_floats.is_float(
source_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type
cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] to_floats(
column_view input_col,
column_view strings,
data_type output_type) except +

cdef unique_ptr[column] from_floats(
column_view input_col) except +
column_view floats) except +

cdef unique_ptr[column] is_float(
column_view source_strings
column_view input
) except +
5 changes: 3 additions & 2 deletions python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
# the License.
# =============================================================================

set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx
convert_fixed_point.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx
set(cython_sources
convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx
convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ from . cimport (
convert_datetime,
convert_durations,
convert_fixed_point,
convert_floats,
convert_ipv4,
convert_lists,
convert_urls,
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
convert_datetime,
convert_durations,
convert_fixed_point,
convert_floats,
convert_ipv4,
convert_lists,
convert_urls,
Expand Down
11 changes: 11 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.types cimport DataType


# Parse float values from each string in ``strings`` and return them as a
# new numeric column of type ``output_type``.
cpdef Column to_floats(Column strings, DataType output_type)

# Convert the float values in ``floats`` into a new strings column.
cpdef Column from_floats(Column floats)

# Return a boolean column identifying the strings in ``input`` in which all
# characters are valid for conversion to floats.
cpdef Column is_float(Column input)
101 changes: 101 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.strings.convert cimport (
convert_floats as cpp_convert_floats,
)
from pylibcudf.types cimport DataType


cpdef Column to_floats(Column strings, DataType output_type):
    """
    Parse a float value from each string of a strings column and
    return the values as a new numeric column.

    For details, see :cpp:func:`cudf::strings::to_floats`

    Parameters
    ----------
    strings : Column
        Strings instance for this operation.
    output_type : DataType
        Type of float numeric column to return.

    Returns
    -------
    Column
        New column with floats converted from strings.
    """
    cdef unique_ptr[column] converted

    # The libcudf call is made with the GIL released.
    with nogil:
        converted = move(
            cpp_convert_floats.to_floats(strings.view(), output_type.c_obj)
        )

    return Column.from_libcudf(move(converted))


cpdef Column from_floats(Column floats):
    """
    Convert the float values of a column into strings and return them
    as a new strings column.

    For details, see :cpp:func:`cudf::strings::from_floats`

    Parameters
    ----------
    floats : Column
        Numeric column to convert.

    Returns
    -------
    Column
        New strings column with floats as strings.
    """
    cdef unique_ptr[column] converted

    # The libcudf call is made with the GIL released.
    with nogil:
        converted = move(cpp_convert_floats.from_floats(floats.view()))

    return Column.from_libcudf(move(converted))


cpdef Column is_float(Column input):
    """
    Identify the strings in which all characters are valid for
    conversion to floats, returning the result as a boolean column.

    For details, see :cpp:func:`cudf::strings::is_float`

    Parameters
    ----------
    input : Column
        Strings instance for this operation.

    Returns
    -------
    Column
        New column of boolean results for each string.
    """
    cdef unique_ptr[column] flags

    # The libcudf call is made with the GIL released.
    with nogil:
        flags = move(cpp_convert_floats.is_float(input.view()))

    return Column.from_libcudf(move(flags))
33 changes: 33 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
from utils import assert_column_eq


def test_to_floats():
    """``to_floats`` on a strings column matches a pyarrow cast to float32."""
    float_type = pa.float32()
    strings = pa.array(["-1.23", "1", None])

    plc_strings = plc.interop.from_arrow(strings)
    plc_type = plc.interop.from_arrow(float_type)
    result = plc.strings.convert.convert_floats.to_floats(
        plc_strings, plc_type
    )

    expected = strings.cast(float_type)
    assert_column_eq(result, expected)


def test_from_floats():
    """``from_floats`` renders each float as a string and keeps nulls."""
    floats = pa.array([-1.23, 1, None])
    plc_floats = plc.interop.from_arrow(floats)

    result = plc.strings.convert.convert_floats.from_floats(plc_floats)

    expected = pa.array(["-1.23", "1.0", None])
    assert_column_eq(result, expected)


def test_is_float():
    """``is_float`` flags float-like strings and keeps nulls as null."""
    strings = pa.array(["-1.23", "1", "1.2.3", "A", None])
    plc_strings = plc.interop.from_arrow(strings)

    result = plc.strings.convert.convert_floats.is_float(plc_strings)

    expected = pa.array([True, True, False, False, None])
    assert_column_eq(result, expected)

0 comments on commit 910241e

Please sign in to comment.