From 6e378d65c605f282b3b88fa68ec7aa062732c791 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 3 Oct 2024 14:14:42 -0700 Subject: [PATCH 1/2] Add string.convert.convert_ipv4 APIs to pylibcudf --- python/cudf/cudf/_lib/string_casting.pyx | 42 +++------ .../libcudf/strings/convert/convert_ipv4.pxd | 6 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 2 +- .../pylibcudf/strings/convert/__init__.py | 2 +- .../strings/convert/convert_ipv4.pxd | 10 ++ .../strings/convert/convert_ipv4.pyx | 92 +++++++++++++++++++ .../tests/test_string_convert_ipv4.py | 31 +++++++ 8 files changed, 151 insertions(+), 36 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 60a6795a402..53de3ad1baa 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -33,11 +33,6 @@ from pylibcudf.libcudf.strings.convert.convert_integers cimport ( is_hex as cpp_is_hex, to_integers as cpp_to_integers, ) -from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( - integers_to_ipv4 as cpp_integers_to_ipv4, - ipv4_to_integers as cpp_ipv4_to_integers, - is_ipv4 as cpp_is_ipv4, -) from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id @@ -646,14 +641,10 @@ def int2ip(Column input_col): A Column with integer represented in string ipv4 format """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_ipv4(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def ip2int(Column input_col): @@ -669,14 +660,10 @@ def ip2int(Column input_col): A Column with ipv4 represented as integer """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ipv4_to_integers(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def is_ipv4(Column source_strings): @@ -685,15 +672,10 @@ def is_ipv4(Column source_strings): that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn where nnn is integer digits in [0,255]. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_ipv4( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.is_ipv4( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def htoi(Column input_col, **kwargs): diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index fe571cfced6..801db438e92 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -8,11 +8,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] integers_to_ipv4( - column_view input_col) except + + column_view integers) except + cdef unique_ptr[column] is_ipv4( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..46c29e420de 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_durations.pyx convert_datetime.pyx convert_ipv4.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..78f57c68302 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport convert_datetime, convert_durations, convert_ipv4 diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..03b98a117a9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations +from . import convert_datetime, convert_durations, convert_ipv4 diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd new file mode 100644 index 00000000000..c61f5c0bdca --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column ipv4_to_integers(Column input) + +cpdef Column integers_to_ipv4(Column integers) + +cpdef Column is_ipv4(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx new file mode 100644 index 00000000000..835edfb9340 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 + + +cpdef Column ipv4_to_integers(Column input): + """ + Converts IPv4 addresses into integers. + + For details, see cpp:func:`cudf::strings::ipv4_to_integers` + + Parameters + ---------- + input : Column + Strings instance for this operation + + Returns + ------- + Column + New int32 column converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.ipv4_to_integers( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_ipv4(Column integers): + """ + Converts integers into IPv4 addresses as strings. + + For details, see cpp:func:`cudf::strings::integers_to_ipv4` + + Parameters + ---------- + integers : Column + Integer (uint32) column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.integers_to_ipv4( + integers.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_ipv4(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from IPv4 format. + + For details, see cpp:func:`cudf::strings::is_ipv4` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.is_ipv4( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py new file mode 100644 index 00000000000..4dc3e512624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_ipv4_to_integers(): + arr = pa.array(["123.45.67.890", None]) + result = plc.strings.convert.convert_ipv4.ipv4_to_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array([2066564730, None], type=pa.uint32()) + assert_column_eq(result, expected) + + +def test_integers_to_ipv4(): + arr = pa.array([1, 0, None], type=pa.uint32()) + result = plc.strings.convert.convert_ipv4.integers_to_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["0.0.0.1", "0.0.0.0", None]) + assert_column_eq(result, expected) + + +def test_is_ipv4(): + arr = pa.array(["0.0.0.1", "1.2.34", "A", None]) + result = plc.strings.convert.convert_ipv4.is_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, False, False, None]) + assert_column_eq(result, expected) From f860cfbdc0f6b3c646d86692bfb45e8913ccce31 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:14:20 -0700 Subject: [PATCH 2/2] Correct return integer type --- python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx index 835edfb9340..f2a980d4269 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -21,7 +21,7 @@ cpdef Column ipv4_to_integers(Column input): Returns ------- Column - New int32 column converted from strings. + New uint32 column converted from strings. """ cdef unique_ptr[column] c_result