Merge remote-tracking branch 'upstream/branch-24.12' into pylibcudf/s…

…trings/convert_lists
rapidsai · Oct 9, 2024 · 910241e · 910241e
2 parents edfaa92 + 3791c8a
commit 910241e
Show file tree

Hide file tree

Showing 12 changed files with 214 additions and 50 deletions.
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
@@ -288,7 +288,7 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
                                std::is_convertible_v<OtherT (*)[], T (*)[]>,  // NOLINT
                              void>* = nullptr>
   constexpr host_span(host_span<OtherT, OtherExtent> const& other) noexcept
-    : base(other.data(), other.size())
+    : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()}
   {
   }
 

diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp
@@ -22,6 +22,7 @@
 #include <cudf/io/parquet.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/pinned_host_memory_resource.hpp>
@@ -125,3 +126,22 @@ TEST_F(PinnedMemoryTest, MakeHostVector)
     EXPECT_FALSE(vec.get_allocator().is_device_accessible());
   }
 }
+
+TEST_F(PinnedMemoryTest, HostSpan)
+{
+  auto test_ctors = [](auto&& vec) {
+    auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible();
+    // A span built directly from the vector must report the vector's device accessibility
+    auto const span = cudf::host_span<int16_t>{vec};
+    EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible);
+    // The flag must also survive the converting constructor (int16_t -> int16_t const)
+    auto const span_converted = cudf::host_span<int16_t const>{span};
+    EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible);
+  };
+
+  cudf::set_allocate_host_as_pinned_threshold(7);
+  for (int i = 1; i < 10; i++) {
+    // sizes straddle the threshold set above: some iterations use pinned memory, some do not
+    test_ctors(cudf::detail::make_host_vector<int16_t>(i, cudf::get_default_stream()));
+  }
+}
diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
@@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of:
    ```
 - `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version
   24.02 of cudf was the last to support pandas 1.5.x.
+- In order for `cudf.pandas` to produce a proxy array that duck-types as a NumPy
+  array, we create a proxy type that actually subclasses `numpy.ndarray`. We can
+  verify this with an `isinstance` check.
+
+  ```python
+  %load_ext cudf.pandas
+  import pandas as pd
+  import numpy as np
+
+  arr = pd.Series([1, 1, 2]).unique() # returns a proxy array
+  isinstance(arr, np.ndarray) # returns True, where arr is a proxy array
+  ```
+  Because the proxy type duck-types as a NumPy array, NumPy functions may attempt to
+  access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API.
+  However, our proxy mechanism is designed to proxy function calls at the Python
+  level, which is incompatible with these types of accesses. To handle these
+  situations, we perform an eager device-to-host (DtoH) copy, which sets the data
+  buffer correctly but makes creating the proxy array slower.
+  In the previous example, creating `arr` performed this kind of implicit DtoH transfer.
+
+  With this approach, we also get compatibility with third-party libraries like `torch`.
+
+  ```python
+  import torch
+  x = torch.from_numpy(arr)
+  ```
 
 ## Can I force running on the CPU?
 

diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
@@ -10,10 +10,6 @@ from libcpp.utility cimport move
 
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.convert.convert_floats cimport (
-    from_floats as cpp_from_floats,
-    to_floats as cpp_to_floats,
-)
 from pylibcudf.libcudf.strings.convert.convert_integers cimport (
     from_integers as cpp_from_integers,
     hex_to_integers as cpp_hex_to_integers,
@@ -33,32 +29,18 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type
 
 
 def floating_to_string(Column input_col):
-    cdef column_view input_column_view = input_col.view()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_from_floats(
-                input_column_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.convert.convert_floats.from_floats(
+        input_col.to_pylibcudf(mode="read"),
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 def string_to_floating(Column input_col, object out_type):
-    cdef column_view input_column_view = input_col.view()
-    cdef unique_ptr[column] c_result
-    cdef type_id tid = <type_id> (
-        <underlying_type_t_type_id> (
-            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
-        )
+    plc_column = plc.strings.convert.convert_floats.to_floats(
+        input_col.to_pylibcudf(mode="read"),
+        dtype_to_pylibcudf_type(out_type)
     )
-    cdef data_type c_out_type = data_type(tid)
-    with nogil:
-        c_result = move(
-            cpp_to_floats(
-                input_column_view,
-                c_out_type))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)
 
 
 def dtos(Column input_col):

diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
@@ -1,31 +1,19 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.convert.convert_floats cimport (
-    is_float as cpp_is_float,
-)
-
 from cudf._lib.column cimport Column
 
+import pylibcudf as plc
+
 
 @acquire_spill_lock()
 def is_float(Column source_strings):
     """
     Returns a Column of boolean values with True for `source_strings`
     that have floats.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_is_float(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.convert.convert_floats.is_float(
+        source_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
@@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type
 cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \
         "cudf::strings" nogil:
     cdef unique_ptr[column] to_floats(
-        column_view input_col,
+        column_view strings,
         data_type output_type) except +
 
     cdef unique_ptr[column] from_floats(
-        column_view input_col) except +
+        column_view floats) except +
 
     cdef unique_ptr[column] is_float(
-        column_view source_strings
+        column_view input
     ) except +
diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
@@ -12,8 +12,9 @@
 # the License.
 # =============================================================================
 
-set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx
-                   convert_fixed_point.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx
+set(cython_sources
+    convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx
+    convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx
 )
 
 set(linked_libraries cudf::cudf)

diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
@@ -4,6 +4,7 @@ from . cimport (
     convert_datetime,
     convert_durations,
     convert_fixed_point,
+    convert_floats,
     convert_ipv4,
     convert_lists,
     convert_urls,

diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py
@@ -4,6 +4,7 @@
     convert_datetime,
     convert_durations,
     convert_fixed_point,
+    convert_floats,
     convert_ipv4,
     convert_lists,
     convert_urls,

diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.types cimport DataType
+
+
+cpdef Column to_floats(Column strings, DataType output_type)
+
+cpdef Column from_floats(Column floats)
+
+cpdef Column is_float(Column input)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx
@@ -0,0 +1,101 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings.convert cimport (
+    convert_floats as cpp_convert_floats,
+)
+from pylibcudf.types cimport DataType
+
+
+cpdef Column to_floats(Column strings, DataType output_type):
+    """
+    Returns a new numeric column by parsing float values from each string
+    in the provided strings column.
+
+    For details, see :cpp:func:`cudf::strings::to_floats`
+
+    Parameters
+    ----------
+    strings : Column
+        Strings column to parse.
+
+    output_type : DataType
+        Type of float numeric column to return.
+
+    Returns
+    -------
+    Column
+        New column with floats converted from strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_floats.to_floats(
+                strings.view(),
+                output_type.c_obj,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column from_floats(Column floats):
+    """
+    Returns a new strings column converting the float values from the
+    provided column into strings.
+
+    For details, see :cpp:func:`cudf::strings::from_floats`
+
+    Parameters
+    ----------
+    floats : Column
+        Numeric column to convert.
+
+    Returns
+    -------
+    Column
+        New strings column with floats as strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_floats.from_floats(
+                floats.view(),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column is_float(Column input):
+    """
+    Returns a boolean column identifying strings in which all
+    characters are valid for conversion to floats.
+
+    For details, see :cpp:func:`cudf::strings::is_float`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to check.
+
+    Returns
+    -------
+    Column
+        New column of boolean results for each string.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_floats.is_float(
+                input.view(),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_to_floats():
+    typ = pa.float32()
+    arr = pa.array(["-1.23", "1", None])  # negative, integral, null
+    result = plc.strings.convert.convert_floats.to_floats(
+        plc.interop.from_arrow(arr), plc.interop.from_arrow(typ)
+    )
+    expected = arr.cast(typ)  # pyarrow's own cast serves as the reference
+    assert_column_eq(result, expected)
+
+
+def test_from_floats():
+    arr = pa.array([-1.23, 1, None])  # negative, integral, null
+    result = plc.strings.convert.convert_floats.from_floats(
+        plc.interop.from_arrow(arr),
+    )
+    expected = pa.array(["-1.23", "1.0", None])  # 1 is expected as "1.0"
+    assert_column_eq(result, expected)
+
+
+def test_is_float():
+    arr = pa.array(["-1.23", "1", "1.2.3", "A", None])  # valid and invalid
+    result = plc.strings.convert.convert_floats.is_float(
+        plc.interop.from_arrow(arr),
+    )
+    expected = pa.array([True, True, False, False, None])  # null stays null
+    assert_column_eq(result, expected)