From bd51a25ea6fdab6ab11e95e2c8192ed7eee43e75 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 9 Oct 2024 16:05:05 -0400
Subject: [PATCH 1/3] [DOC] Document limitation using `cudf.pandas` proxy
 arrays (#16955)

When instantiating a `cudf.pandas` proxy array, a DtoH transfer occurs so that the data buffer is set correctly. We do this because functions which utilize NumPy's C API can utilize the data buffer directly instead of going through `__array__`. This PR documents this limitation.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16955
---
 docs/cudf/source/cudf_pandas/faq.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
index 34b657488c1..5024747227e 100644
--- a/docs/cudf/source/cudf_pandas/faq.md
+++ b/docs/cudf/source/cudf_pandas/faq.md
@@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of:
    ```
 - `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version
   24.02 of cudf was the last to support pandas 1.5.x.
+- In order for `cudf.pandas` to produce a proxy array that ducktypes as a NumPy
+  array, we create a proxy type that actually subclasses `numpy.ndarray`. We can
+  verify this with an isinstance check.
+
+  ```python
+  %load_ext cudf.pandas
+  import pandas as pd
+  import numpy as np
+
+  arr = pd.Series([1, 1, 2]).unique() # returns a proxy array
+  isinstance(arr, np.ndarray) # returns True, where arr is a proxy array
+  ```
+  Because the proxy type ducktypes as a NumPy array, NumPy functions may attempt to
+  access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API.
+  However, our proxy mechanism is designed to proxy function calls at the Python
+  level, which is incompatible with these types of accesses. To handle these
+  situations, we perform an eager device-to-host (DtoH) copy, which sets the data
+  buffer correctly but incurs the cost of extra time when creating the proxy array.
+  In the previous example, creating `arr` performed this kind of implicit DtoH transfer.
+
+  With this approach, we also get compatibility with third-party libraries like `torch`.
+
+  ```python
+  import torch
+  x = torch.from_numpy(arr)
+  ```
 
 ## Can I force running on the CPU?
 

From c7b51195c675af47d0f3dd69c04d0fcc6920eca5 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 9 Oct 2024 15:17:32 -0700
Subject: [PATCH 2/3] Fix `host_span` constructor to correctly copy
 `is_device_accessible` (#17020)

One of the `host_span` constructors was not updated when we added `is_device_accessible`, so the value was not assigned.
This PR fixes this simple error and adds a test that checks that this property is correctly set when creating `host_span`s.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/17020
---
 cpp/include/cudf/utilities/span.hpp           |  2 +-
 .../utilities_tests/pinned_memory_tests.cpp   | 20 +++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index 914731ea417..f3e1a61d075 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -288,7 +288,7 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
                                std::is_convertible_v<OtherT (*)[], T (*)[]>,  // NOLINT
                              void>* = nullptr>
   constexpr host_span(host_span<OtherT, OtherExtent> const& other) noexcept
-    : base(other.data(), other.size())
+    : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()}
   {
   }
 
diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp
index ae7c6fa8b8c..1e1e21fe18a 100644
--- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp
+++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp
@@ -22,6 +22,7 @@
 #include <cudf/io/parquet.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/pinned_host_memory_resource.hpp>
@@ -125,3 +126,22 @@ TEST_F(PinnedMemoryTest, MakeHostVector)
     EXPECT_FALSE(vec.get_allocator().is_device_accessible());
   }
 }
+
+TEST_F(PinnedMemoryTest, HostSpan)
+{
+  auto test_ctors = [](auto&& vec) {
+    auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible();
+    // Test conversion from a vector
+    auto const span = cudf::host_span<int16_t>{vec};
+    EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible);
+    // Test conversion from host_span with different type
+    auto const span_converted = cudf::host_span<int16_t const>{span};
+    EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible);
+  };
+
+  cudf::set_allocate_host_as_pinned_threshold(7);
+  for (int i = 1; i < 10; i++) {
+    // some iterations will use pinned memory, some will not
+    test_ctors(cudf::detail::make_host_vector<int16_t>(i, cudf::get_default_stream()));
+  }
+}

From 3791c8a9d1aeb7474bb9ef324a089a569183406c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 9 Oct 2024 13:45:02 -1000
Subject: [PATCH 3/3] Add string.convert_floats APIs to pylibcudf (#16990)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16990
---
 python/cudf/cudf/_lib/string_casting.pyx      |  34 ++----
 .../_lib/strings/convert/convert_floats.pyx   |  24 ++---
 .../strings/convert/convert_floats.pxd        |   6 +-
 .../pylibcudf/strings/convert/CMakeLists.txt  |   2 +-
 .../pylibcudf/strings/convert/__init__.pxd    |   1 +
 .../pylibcudf/strings/convert/__init__.py     |   1 +
 .../strings/convert/convert_floats.pxd        |  11 ++
 .../strings/convert/convert_floats.pyx        | 101 ++++++++++++++++++
 .../tests/test_string_convert_floats.py       |  33 ++++++
 9 files changed, 165 insertions(+), 48 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py

diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
index d9595f4ab0a..93b67bd4c9d 100644
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ b/python/cudf/cudf/_lib/string_casting.pyx
@@ -10,10 +10,6 @@ from libcpp.utility cimport move
 
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.convert.convert_floats cimport (
-    from_floats as cpp_from_floats,
-    to_floats as cpp_to_floats,
-)
 from pylibcudf.libcudf.strings.convert.convert_integers cimport (
     from_integers as cpp_from_integers,
     hex_to_integers as cpp_hex_to_integers,
@@ -33,32 +29,18 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type
 
 
 def floating_to_string(Column input_col):
-    cdef column_view input_column_view = input_col.view()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_from_floats(
-                input_column_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.convert.convert_floats.from_floats(
+        input_col.to_pylibcudf(mode="read"),
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 def string_to_floating(Column input_col, object out_type):
-    cdef column_view input_column_view = input_col.view()
-    cdef unique_ptr[column] c_result
-    cdef type_id tid = <type_id> (
-        <underlying_type_t_type_id> (
-            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
-        )
+    plc_column = plc.strings.convert.convert_floats.to_floats(
+        input_col.to_pylibcudf(mode="read"),
+        dtype_to_pylibcudf_type(out_type)
     )
-    cdef data_type c_out_type = data_type(tid)
-    with nogil:
-        c_result = move(
-            cpp_to_floats(
-                input_column_view,
-                c_out_type))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)
 
 
 def dtos(Column input_col):
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
index 7965b588703..5da6e3f10cc 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
@@ -1,18 +1,11 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.convert.convert_floats cimport (
-    is_float as cpp_is_float,
-)
-
 from cudf._lib.column cimport Column
 
+import pylibcudf as plc
+
 
 @acquire_spill_lock()
 def is_float(Column source_strings):
@@ -20,12 +13,7 @@ def is_float(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that have floats.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_is_float(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.convert.convert_floats.is_float(
+        source_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
index f4fc4674506..a45c7f9979e 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
@@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type
 cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \
         "cudf::strings" nogil:
     cdef unique_ptr[column] to_floats(
-        column_view input_col,
+        column_view strings,
         data_type output_type) except +
 
     cdef unique_ptr[column] from_floats(
-        column_view input_col) except +
+        column_view floats) except +
 
     cdef unique_ptr[column] is_float(
-        column_view source_strings
+        column_view input
     ) except +
diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
index 41aeb72039b..7b228c06a18 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx
-                   convert_fixed_point.pyx convert_ipv4.pyx convert_urls.pyx
+                   convert_fixed_point.pyx convert_floats.pyx convert_ipv4.pyx convert_urls.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
index b4b0b521e39..be6145384ad 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
@@ -4,6 +4,7 @@ from . cimport (
     convert_datetime,
     convert_durations,
     convert_fixed_point,
+    convert_floats,
     convert_ipv4,
     convert_urls,
 )
diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py
index 409620fce45..7c94387282b 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py
@@ -4,6 +4,7 @@
     convert_datetime,
     convert_durations,
     convert_fixed_point,
+    convert_floats,
     convert_ipv4,
     convert_urls,
 )
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd
new file mode 100644
index 00000000000..1284ff552aa
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.types cimport DataType
+
+
+cpdef Column to_floats(Column strings, DataType output_type)
+
+cpdef Column from_floats(Column floats)
+
+cpdef Column is_float(Column input)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx
new file mode 100644
index 00000000000..8081aadb085
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx
@@ -0,0 +1,101 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings.convert cimport (
+    convert_floats as cpp_convert_floats,
+)
+from pylibcudf.types cimport DataType
+
+
+cpdef Column to_floats(Column strings, DataType output_type):
+    """
+    Returns a new numeric column by parsing float values from each string
+    in the provided strings column.
+
+    For details, see :cpp:func:`cudf::strings::to_floats`
+
+    Parameters
+    ----------
+    strings : Column
+        Strings instance for this operation.
+
+    output_type : DataType
+        Type of float numeric column to return.
+
+    Returns
+    -------
+    Column
+        New column with floats converted from strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_floats.to_floats(
+                strings.view(),
+                output_type.c_obj,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column from_floats(Column floats):
+    """
+    Returns a new strings column converting the float values from the
+    provided column into strings.
+
+    For details, see :cpp:func:`cudf::strings::from_floats`
+
+    Parameters
+    ----------
+    floats : Column
+        Numeric column to convert.
+
+    Returns
+    -------
+    Column
+        New strings column with floats as strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_floats.from_floats(
+                floats.view(),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column is_float(Column input):
+    """
+    Returns a boolean column identifying strings in which all
+    characters are valid for conversion to floats.
+
+    For details, see :cpp:func:`cudf::strings::is_float`
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation.
+
+    Returns
+    -------
+    Column
+        New column of boolean results for each string.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_floats.is_float(
+                input.view(),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
new file mode 100644
index 00000000000..e9918fab559
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_to_floats():
+    typ = pa.float32()
+    arr = pa.array(["-1.23", "1", None])
+    result = plc.strings.convert.convert_floats.to_floats(
+        plc.interop.from_arrow(arr), plc.interop.from_arrow(typ)
+    )
+    expected = arr.cast(typ)
+    assert_column_eq(result, expected)
+
+
+def test_from_floats():
+    arr = pa.array([-1.23, 1, None])
+    result = plc.strings.convert.convert_floats.from_floats(
+        plc.interop.from_arrow(arr),
+    )
+    expected = pa.array(["-1.23", "1.0", None])
+    assert_column_eq(result, expected)
+
+
+def test_is_float():
+    arr = pa.array(["-1.23", "1", "1.2.3", "A", None])
+    result = plc.strings.convert.convert_floats.is_float(
+        plc.interop.from_arrow(arr),
+    )
+    expected = pa.array([True, True, False, False, None])
+    assert_column_eq(result, expected)