-
Notifications
You must be signed in to change notification settings - Fork 912
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add string.extract APIs to pylibcudf (#16823)
Contributes to #15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #16823
- Loading branch information
Showing
11 changed files
with
149 additions
and
36 deletions.
There are no files selected for viewing
6 changes: 6 additions & 0 deletions
6
docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
======= | ||
extract | ||
======= | ||
|
||
.. automodule:: pylibcudf.strings.extract | ||
:members: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ strings | |
capitalize | ||
char_types | ||
contains | ||
extract | ||
find | ||
regex_flags | ||
regex_program | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ from . cimport ( | |
case, | ||
char_types, | ||
contains, | ||
extract, | ||
find, | ||
regex_flags, | ||
regex_program, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
case, | ||
char_types, | ||
contains, | ||
extract, | ||
find, | ||
regex_flags, | ||
regex_program, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from pylibcudf.column cimport Column | ||
from pylibcudf.strings.regex_program cimport RegexProgram | ||
from pylibcudf.table cimport Table | ||
|
||
|
||
cpdef Table extract(Column input, RegexProgram prog) | ||
|
||
cpdef Column extract_all_record(Column input, RegexProgram prog) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from libcpp.memory cimport unique_ptr | ||
from libcpp.utility cimport move | ||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.column.column cimport column | ||
from pylibcudf.libcudf.strings cimport extract as cpp_extract | ||
from pylibcudf.libcudf.table.table cimport table | ||
from pylibcudf.strings.regex_program cimport RegexProgram | ||
from pylibcudf.table cimport Table | ||
|
||
|
||
cpdef Table extract(Column input, RegexProgram prog):
    """
    Extract regex capture groups from a strings column into a table.

    Each column of the returned table corresponds to one matching
    group specified in the given regex_program object.

    For details, see :cpp:func:`cudf::strings::extract`.

    Parameters
    ----------
    input : Column
        Strings instance for this operation
    prog : RegexProgram
        Regex program instance

    Returns
    -------
    Table
        Columns of strings extracted from the input column.
    """
    cdef unique_ptr[table] extracted

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        extracted = move(
            cpp_extract.extract(input.view(), prog.c_obj.get()[0])
        )

    # Hand ownership of the libcudf table over to the Python-level wrapper.
    return Table.from_libcudf(move(extracted))
|
||
|
||
cpdef Column extract_all_record(Column input, RegexProgram prog):
    """
    Extract regex capture groups from a strings column into a lists column.

    Each row of the returned lists column holds the strings that
    correspond to the matching groups specified in the given
    regex_program object.

    For details, see :cpp:func:`cudf::strings::extract_all_record`.

    Parameters
    ----------
    input : Column
        Strings instance for this operation
    prog : RegexProgram
        Regex program instance

    Returns
    -------
    Column
        Lists column containing strings extracted from the input column
    """
    cdef unique_ptr[column] extracted

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        extracted = move(
            cpp_extract.extract_all_record(
                input.view(), prog.c_obj.get()[0]
            )
        )

    # Hand ownership of the libcudf column over to the Python-level wrapper.
    return Column.from_libcudf(move(extracted))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
import pyarrow as pa | ||
import pyarrow.compute as pc | ||
import pylibcudf as plc | ||
|
||
|
||
def test_extract():
    """Check ``extract`` column-by-column against ``pyarrow.compute.extract_regex``."""
    strings = pa.array(["a1", "b2", "c3"])
    plc_input = plc.interop.from_arrow(strings)
    program = plc.strings.regex_program.RegexProgram.create(
        "([ab])(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    got = plc.interop.to_arrow(
        plc.strings.extract.extract(plc_input, program)
    )

    # The pyarrow reference is given the same two groups, but named
    # (presumably because extract_regex needs named groups to build its
    # struct fields -- confirm against the pyarrow docs).
    want = pc.extract_regex(strings, "(?P<letter>[ab])(?P<digit>\\d)")

    for position, got_column in enumerate(got.itercolumns()):
        want_column = pa.chunked_array(want.field(position))
        # Nulls on the pylibcudf side are mapped to "" before comparing
        # with the pyarrow field values.
        assert got_column.fill_null("").equals(want_column)
|
||
|
||
def test_extract_all_record():
    """Check ``extract_all_record`` against a hand-written lists column."""
    strings = pa.array(["a1", "b2", "c3"])
    plc_input = plc.interop.from_arrow(strings)
    program = plc.strings.regex_program.RegexProgram.create(
        "([ab])(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    got = plc.interop.to_arrow(
        plc.strings.extract.extract_all_record(plc_input, program)
    )

    # One list of captured groups per input row; the third row ("c3")
    # is expected to be null.
    rows = [["a", "1"], ["b", "2"], None]
    want = pa.chunked_array([pa.array(rows, type=got.type)])
    assert got.equals(want)