Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.10' into dask-cudf-a…
Browse files Browse the repository at this point in the history
…rrow-filesystem
  • Loading branch information
rjzamora committed Sep 19, 2024
2 parents e391789 + 272a703 commit e154d01
Show file tree
Hide file tree
Showing 30 changed files with 650 additions and 219 deletions.
14 changes: 13 additions & 1 deletion ci/cudf_pandas_scripts/pandas-tests/job-summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,18 @@ def emoji_failed(x):
pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
diff_df = pr_df - main_df
total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call']
pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1)
pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1)

pr_df = pr_df[["total", "passed", "failed", "skipped"]]
cpu_usage_mean = pr_df['CPU Usage'].mean().round(2)
gpu_usage_mean = pr_df['GPU Usage'].mean().round(2)

# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns
pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%'
pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%'

pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']]
diff_df = diff_df[["total", "passed", "failed", "skipped"]]
diff_df.columns = diff_df.columns + "_diff"
diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
Expand All @@ -95,6 +105,8 @@ def emoji_failed(x):

print(comment)
print()
print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%")
print()
print("Here are the results of running the Pandas tests against this PR:")
print()
print(df.to_markdown())
8 changes: 4 additions & 4 deletions cpp/include/cudf/detail/aggregation/aggregation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1497,8 +1497,7 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation);
*
* @tparam F Type of callable
* @param k The `aggregation::Kind` value to dispatch
* aram f The callable that accepts an `aggregation::Kind` non-type template
* argument.
* @param f The callable function object that accepts an `aggregation::Kind` non-type template argument.
* @param args Parameter pack forwarded to the `operator()` invocation
* @return Forwards the return value of the callable.
*/
Expand Down Expand Up @@ -1626,6 +1625,7 @@ struct dispatch_source {
* parameter of the callable `F`
* @param k The `aggregation::Kind` used to dispatch an `aggregation::Kind`
* non-type template parameter for the second template parameter of the callable `F`.
* @param f The callable function object that accepts the dispatched `data_type` and `aggregation::Kind`.
* @param args Parameter pack forwarded to the `operator()` invocation.
*/
Expand All @@ -1644,8 +1644,8 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(d
* @brief Returns the target `data_type` for the specified aggregation k
* performed on elements of type source_type.
*
* aram source_type The element type to be aggregated
* aram k The aggregation
* @param source_type The element type to be aggregated
* @param k The aggregation kind
* @return data_type The target_type of k performed on source_type
* elements
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
extract
=======

.. automodule:: pylibcudf.strings.extract
:members:
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ strings
capitalize
char_types
contains
extract
find
regex_flags
regex_program
repeat
replace
slice
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
======
repeat
======

.. automodule:: pylibcudf.strings.repeat
:members:
80 changes: 13 additions & 67 deletions python/cudf/cudf/_lib/strings/contains.pyx
Original file line number Diff line number Diff line change
@@ -1,27 +1,10 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libc.stdint cimport uint32_t

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.contains cimport (
count_re as cpp_count_re,
like as cpp_like,
matches_re as cpp_matches_re,
)
from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from pylibcudf.libcudf.strings.regex_program cimport regex_program

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

from pylibcudf.strings import contains
from pylibcudf.strings.regex_program import RegexProgram
Expand All @@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags):
Returns a Column with count of occurrences of `reg_ex` in
each string of `source_strings`
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string reg_ex_string = <string>str(reg_ex).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(reg_ex_string, c_flags))
c_result = move(cpp_count_re(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = RegexProgram.create(str(reg_ex), flags)
return Column.from_pylibcudf(
contains.count_re(source_strings.to_pylibcudf(mode="read"), prog)
)


@acquire_spill_lock()
Expand All @@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags):
Returns a Column with each value True if the string matches `reg_ex`
regular expression with each record of `source_strings`
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string reg_ex_string = <string>str(reg_ex).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(reg_ex_string, c_flags))
c_result = move(cpp_matches_re(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = RegexProgram.create(str(reg_ex), flags)
return Column.from_pylibcudf(
contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog)
)


@acquire_spill_lock()
Expand All @@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape):
Returns a Column with each value True if the string matches the
`py_pattern` like expression with each record of `source_strings`
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef DeviceScalar pattern = py_pattern.device_value
cdef DeviceScalar escape = py_escape.device_value

cdef const string_scalar* scalar_ptn = <const string_scalar*>(
pattern.get_raw_ptr()
)
cdef const string_scalar* scalar_esc = <const string_scalar*>(
escape.get_raw_ptr()
plc_column = contains.like(
source_strings.to_pylibcudf(mode="read"),
py_pattern.device_value.c_value,
py_escape.device_value.c_value,
)

with nogil:
c_result = move(cpp_like(
source_view,
scalar_ptn[0],
scalar_esc[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)
34 changes: 6 additions & 28 deletions python/cudf/cudf/_lib/strings/extract.pyx
Original file line number Diff line number Diff line change
@@ -1,21 +1,12 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libc.stdint cimport uint32_t
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract
from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from pylibcudf.libcudf.strings.regex_program cimport regex_program
from pylibcudf.libcudf.table.table cimport table

from cudf._lib.column cimport Column
from cudf._lib.utils cimport data_from_unique_ptr

import pylibcudf as plc


@acquire_spill_lock()
Expand All @@ -26,21 +17,8 @@ def extract(Column source_strings, object pattern, uint32_t flags):
The returning data contains one row for each subject string,
and one column for each group.
"""
cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()

cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_extract(
source_view,
dereference(c_prog)
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags)
plc_result = plc.strings.extract.extract(
source_strings.to_pylibcudf(mode="read"), prog
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns()))
40 changes: 12 additions & 28 deletions python/cudf/cudf/_lib/strings/repeat.pyx
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings cimport repeat as cpp_repeat
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column

import pylibcudf as plc


@acquire_spill_lock()
def repeat_scalar(Column source_strings,
Expand All @@ -21,16 +16,11 @@ def repeat_scalar(Column source_strings,
each string in `source_strings`
`repeats` number of times.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_repeat.repeat_strings(
source_view,
repeats
))

return Column.from_unique_ptr(move(c_result))
plc_result = plc.strings.repeat.repeat_strings(
source_strings.to_pylibcudf(mode="read"),
repeats
)
return Column.from_pylibcudf(plc_result)


@acquire_spill_lock()
Expand All @@ -41,14 +31,8 @@ def repeat_sequence(Column source_strings,
each string in `source_strings`
`repeats` number of times.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view repeats_view = repeats.view()

with nogil:
c_result = move(cpp_repeat.repeat_strings(
source_view,
repeats_view
))

return Column.from_unique_ptr(move(c_result))
plc_result = plc.strings.repeat.repeat_strings(
source_strings.to_pylibcudf(mode="read"),
repeats.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_result)
6 changes: 2 additions & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,11 +623,9 @@ def extract(
"unsupported value for `flags` parameter"
)

data, _ = libstrings.extract(self._column, pat, flags)
data = libstrings.extract(self._column, pat, flags)
if len(data) == 1 and expand is False:
data = next(iter(data.values()))
else:
data = data
_, data = data.popitem()
return self._return_or_inplace(data, expand=expand)

def contains(
Expand Down
Loading

0 comments on commit e154d01

Please sign in to comment.