diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
index 895ba83ee54..e0ea775aad5 100644
--- a/.github/copy-pr-bot.yaml
+++ b/.github/copy-pr-bot.yaml
@@ -2,3 +2,4 @@
 # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
 
 enabled: true
+auto_sync_draft: false
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
new file mode 100644
index 00000000000..3b972f31ca4
--- /dev/null
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -0,0 +1,26 @@
+name: Trigger Breaking Change Notifications
+
+on:
+  pull_request_target:
+    types:
+      - closed
+      - reopened
+      - labeled
+      - unlabeled
+
+jobs:
+  trigger-notifier:
+    if: contains(github.event.pull_request.labels.*.name, 'breaking')
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12
+    with:
+      sender_login: ${{ github.event.sender.login }}
+      sender_avatar: ${{ github.event.sender.avatar_url }}
+      repo: ${{ github.repository }}
+      pr_number: ${{ github.event.pull_request.number }}
+      pr_title: "${{ github.event.pull_request.title }}"
+      pr_body: "${{ github.event.pull_request.body || '_Empty PR description_' }}"
+      pr_base_ref: ${{ github.event.pull_request.base.ref }}
+      pr_author: ${{ github.event.pull_request.user.login }}
+      event_action: ${{ github.event.action }}
+      pr_merged: ${{ github.event.pull_request.merged }}
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index 8156258c810..a4885d59cc5 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -244,6 +244,7 @@ struct validity_fn {
  *
  * @param strings_columns Table of strings columns
  * @param column_names Column of names for each column in the table
+ * @param num_rows Number of rows in the table
  * @param row_prefix Prepend this string to each row
  * @param row_suffix Append this string to each row
  * @param value_separator Separator between values
@@ -255,6 +256,7 @@ struct validity_fn {
  */
 std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
                                           column_view const& column_names,
+                                          size_type const num_rows,
                                           string_view const row_prefix,
                                           string_view const row_suffix,
                                           string_view const value_separator,
@@ -268,8 +270,7 @@ std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
   auto const num_columns = strings_columns.num_columns();
   CUDF_EXPECTS(num_columns == column_names.size(),
                "Number of column names should be equal to number of columns in the table");
-  auto const strings_count = strings_columns.num_rows();
-  if (strings_count == 0) // empty begets empty
+  if (num_rows == 0) // empty begets empty
     return make_empty_column(type_id::STRING);
   // check all columns are of type string
   CUDF_EXPECTS(std::all_of(strings_columns.begin(),
@@ -277,31 +278,46 @@ std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
                            [](auto const& c) { return c.type().id() == type_id::STRING; }),
                "All columns must be of type string");
   auto constexpr strviews_per_column = 3; // (for each "column_name:", "value", "separator")
-  auto const num_strviews_per_row = strings_columns.num_columns() * strviews_per_column + 1;
+  auto const num_strviews_per_row = strings_columns.num_columns() == 0
+                                      ? 2
+                                      : (1 + strings_columns.num_columns() * strviews_per_column);
   // e.g. {col1: value, col2: value, col3: value} = 1 + 3 + 3 + (3-1) + 1 = 10
   auto tbl_device_view = cudf::table_device_view::create(strings_columns, stream);
   auto d_column_names = column_device_view::create(column_names, stream);
 
   // Note for future: chunk it but maximize parallelism, if memory usage is high.
-  auto const total_strings = num_strviews_per_row * strings_columns.num_rows();
-  auto const total_rows = strings_columns.num_rows() * strings_columns.num_columns();
+  auto const total_strings = num_strviews_per_row * num_rows;
+  auto const total_rows = num_rows * strings_columns.num_columns();
 
   rmm::device_uvector<string_view> d_strviews(total_strings, stream);
-  struct_scatter_strings_fn scatter_fn{*tbl_device_view,
-                                       *d_column_names,
-                                       strviews_per_column,
-                                       num_strviews_per_row,
-                                       row_prefix,
-                                       row_suffix,
-                                       value_separator,
-                                       narep.value(stream),
-                                       include_nulls,
-                                       d_strviews.begin()};
-  // scatter row_prefix, row_suffix, column_name:, value, value_separator as string_views
-  thrust::for_each(rmm::exec_policy(stream),
-                   thrust::make_counting_iterator(0),
-                   thrust::make_counting_iterator(total_rows),
-                   scatter_fn);
+  if (strings_columns.num_columns() > 0) {
+    struct_scatter_strings_fn scatter_fn{*tbl_device_view,
+                                         *d_column_names,
+                                         strviews_per_column,
+                                         num_strviews_per_row,
+                                         row_prefix,
+                                         row_suffix,
+                                         value_separator,
+                                         narep.value(stream),
+                                         include_nulls,
+                                         d_strviews.begin()};
+    // scatter row_prefix, row_suffix, column_name:, value, value_separator as string_views
+    thrust::for_each(rmm::exec_policy_nosync(stream),
+                     thrust::make_counting_iterator(0),
+                     thrust::make_counting_iterator(total_rows),
+                     scatter_fn);
+  } else {
+    thrust::for_each(
+      rmm::exec_policy_nosync(stream),
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(num_rows),
+      [d_strviews = d_strviews.begin(), row_prefix, row_suffix, num_strviews_per_row] __device__(
+        auto idx) {
+        auto const this_index = idx * num_strviews_per_row;
+        d_strviews[this_index] = row_prefix;
+        d_strviews[this_index + num_strviews_per_row - 1] = row_suffix;
+      });
+  }
   if (!include_nulls) {
     // if previous column was null, then we skip the value separator
     rmm::device_uvector<string_view> d_str_separator(total_rows, stream);
@@ -341,7 +357,7 @@ std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
 
   // gather from offset and create a new string column
   auto old_offsets = strings_column_view(joined_col->view()).offsets();
-  rmm::device_uvector<size_type> row_string_offsets(strings_columns.num_rows() + 1, stream, mr);
+  rmm::device_uvector<size_type> row_string_offsets(num_rows + 1, stream, mr);
   auto const d_strview_offsets = cudf::detail::make_counting_transform_iterator(
     0, cuda::proclaim_return_type<size_type>([num_strviews_per_row] __device__(size_type const i) {
          return i * num_strviews_per_row;
@@ -353,7 +369,7 @@ std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
                     row_string_offsets.begin());
   auto chars_data = joined_col->release().data;
   return make_strings_column(
-    strings_columns.num_rows(),
+    num_rows,
     std::make_unique<column>(std::move(row_string_offsets), rmm::device_buffer{}, 0),
     std::move(chars_data.release()[0]),
     0,
@@ -677,6 +693,7 @@ struct column_to_strings_fn {
       auto col_string = operator()(child_it,
                                    child_it + column.num_children(),
                                    children_names,
+                                   column.size(),
                                    struct_row_end_wrap.value(stream_));
       col_string->set_null_mask(cudf::detail::copy_bitmask(column, stream_, mr_),
                                 column.null_count());
@@ -688,6 +705,7 @@ struct column_to_strings_fn {
   std::unique_ptr<column> operator()(column_iterator column_begin,
                                      column_iterator column_end,
                                      host_span<column_name_info const>
children_names, + size_type num_rows, cudf::string_view const row_end_wrap_value) const { auto const num_columns = std::distance(column_begin, column_end); @@ -733,6 +751,7 @@ struct column_to_strings_fn { // return struct_to_strings(str_table_view, column_names_view, + num_rows, struct_row_begin_wrap.value(stream_), row_end_wrap_value, struct_value_separator.value(stream_), @@ -908,6 +927,7 @@ void write_json_uncompressed(data_sink* out_sink, auto str_concat_col = converter(sub_view.begin(), sub_view.end(), user_column_names, + sub_view.num_rows(), d_line_terminator_with_row_end.value(stream)); // Needs line_terminator at the end, to separate from next chunk diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 573101cefd9..3645b3333b3 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -50,11 +50,6 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; -using row_hash_legacy = - cudf::row_hasher; - -using row_equality_legacy = cudf::row_equality_comparator; - bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type); } // namespace detail } // namespace cudf diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 45e0fc345b5..f7c9f6df8ad 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -24,7 +24,6 @@ set(cython_sources interop.pyx json.pyx merge.pyx - null_mask.pyx orc.pyx parquet.pyx reduce.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index c51db601985..7474c4e8cd1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -11,7 +11,6 @@ interop, json, merge, - null_mask, nvtext, orc, parquet, diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 94dbdf5534d..9cbe11d61ac 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -11,7 +11,6 @@ import pylibcudf import rmm import cudf -import cudf._lib as libcudf from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -36,7 +35,6 @@ from cudf._lib.types cimport ( dtype_to_pylibcudf_type, ) -from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column cimport pylibcudf.libcudf.copying as cpp_copying @@ -159,7 +157,10 @@ cdef class Column: if self.base_mask is None or self.offset == 0: self._mask = self.base_mask else: - self._mask = libcudf.null_mask.copy_bitmask(self) + with acquire_spill_lock(): + self._mask = as_buffer( + pylibcudf.null_mask.copy_bitmask(self.to_pylibcudf(mode="read")) + ) return self._mask @property @@ -183,7 +184,9 @@ cdef class Column: if value is not None: # bitmask size must be relative to offset = 0 data. - required_size = bitmask_allocation_size_bytes(self.base_size) + required_size = pylibcudf.null_mask.bitmask_allocation_size_bytes( + self.base_size + ) if value.size < required_size: error_msg = ( "The Buffer for mask is smaller than expected, " @@ -220,7 +223,7 @@ cdef class Column: and compute new data Buffers zero-copy that use pointer arithmetic to properly adjust the pointer. 
""" - mask_size = bitmask_allocation_size_bytes(self.size) + mask_size = pylibcudf.null_mask.bitmask_allocation_size_bytes(self.size) required_num_bytes = -(-self.size // 8) # ceiling divide error_msg = ( "The value for mask is smaller than expected, got {} bytes, " @@ -790,13 +793,17 @@ cdef class Column: mask = as_buffer( rmm.DeviceBuffer( ptr=mask_ptr, - size=bitmask_allocation_size_bytes(base_size) + size=pylibcudf.null_mask.bitmask_allocation_size_bytes( + base_size + ) ) ) else: mask = as_buffer( data=mask_ptr, - size=bitmask_allocation_size_bytes(base_size), + size=pylibcudf.null_mask.bitmask_allocation_size_bytes( + base_size + ), owner=mask_owner, exposed=True ) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 59a970263e0..641fc18c203 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -202,46 +202,71 @@ def read_csv( raise ValueError( "dtype should be a scalar/str/list-like/dict-like" ) + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) + .compression(c_compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range[0]) + .byte_range_size(byte_range[1]) + .nrows(nrows if nrows is not None else -1) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + .lineterminator(str(lineterminator)) + .quotechar(quotechar) + .decimal(decimal) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if names is not None: + options.set_names([str(name) for name in names]) + + if prefix is not None: + options.set_prefix(prefix) + + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + + if delimiter is not None: + options.set_delimiter(delimiter) + + if thousands is not None: + options.set_thousands(thousands) - lineterminator = str(lineterminator) + if comment is not None: + options.set_comment(comment) + + if parse_dates is not None: + options.set_parse_dates(list(parse_dates)) + + if hex_cols is not None: + options.set_parse_hex(list(hex_cols)) + + options.set_dtypes(new_dtypes) + + if true_values is not None: + options.set_true_values([str(val) for val in true_values]) + + if false_values is not None: + options.set_false_values([str(val) for val in false_values]) + + if na_values is not None: + options.set_na_values([str(val) for val in na_values]) df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io( - plc.io.csv.read_csv( - plc.io.SourceInfo([datasource]), - lineterminator=lineterminator, - quotechar = quotechar, - quoting = quoting, - doublequote = doublequote, - header = header, - mangle_dupe_cols = mangle_dupe_cols, - usecols = usecols, - delimiter = delimiter, - delim_whitespace = delim_whitespace, - skipinitialspace = skipinitialspace, - col_names = names, - dtypes = new_dtypes, - skipfooter = skipfooter, - skiprows = skiprows, - dayfirst = dayfirst, - compression = c_compression, - thousands = thousands, - decimal = decimal, - true_values = true_values, - false_values = false_values, - nrows = nrows if nrows is not None else -1, - byte_range_offset = byte_range[0], - byte_range_size = byte_range[1], - skip_blank_lines = skip_blank_lines, - parse_dates = parse_dates, - parse_hex = hex_cols, - comment = 
comment, - na_values = na_values, - keep_default_na = keep_default_na, - na_filter = na_filter, - prefix = prefix, - ) - ) + *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) ) if dtype is not None: diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx deleted file mode 100644 index d54e8e66281..00000000000 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf -from pylibcudf.null_mask import MaskState - -from cudf.core.buffer import acquire_spill_lock, as_buffer - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def copy_bitmask(Column col): - """ - Copies column's validity mask buffer into a new buffer, shifting by the - offset if nonzero - """ - if col.base_mask is None: - return None - - rmm_db = pylibcudf.null_mask.copy_bitmask(col.to_pylibcudf(mode="read")) - buf = as_buffer(rmm_db) - return buf - - -def bitmask_allocation_size_bytes(num_bits): - """ - Given a size, calculates the number of bytes that should be allocated for a - column validity mask - """ - return pylibcudf.null_mask.bitmask_allocation_size_bytes(num_bits) - - -def create_null_mask(size, state=MaskState.UNINITIALIZED): - """ - Given a size and a mask state, allocate a mask that can properly represent - the given size with the given mask state - - Parameters - ---------- - size : int - Number of elements the mask needs to be able to represent - state : ``MaskState``, default ``MaskState.UNINITIALIZED`` - State the null mask should be created in - """ - rmm_db = pylibcudf.null_mask.create_null_mask(size, state) - buf = as_buffer(rmm_db) - return buf - - -@acquire_spill_lock() -def bitmask_and(list columns): - rmm_db, other = pylibcudf.null_mask.bitmask_and( - [col.to_pylibcudf(mode="read") for col in columns] - ) - buf = as_buffer(rmm_db) - return buf, other - - -@acquire_spill_lock() -def bitmask_or(list columns): - rmm_db, other = pylibcudf.null_mask.bitmask_or( - [col.to_pylibcudf(mode="read") for col in columns] - ) - buf = as_buffer(rmm_db) - return buf, other diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx deleted file mode 100644 index 509cfe5e9f8..00000000000 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool -from libcpp.vector cimport vector - -from cudf._lib.column cimport Column - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf as plc - - -@acquire_spill_lock() -def quantile( - Column input, - vector[double] q, - str interp, - Column ordered_indices, - bool exact, -): - return Column.from_pylibcudf( - plc.quantiles.quantile( - input.to_pylibcudf(mode="read"), - q, - plc.types.Interpolation[interp.upper()], - ordered_indices.to_pylibcudf(mode="read"), - exact - ) - ) - - -def quantile_table( - list source_columns, - vector[double] q, - object interp, - object is_input_sorted, - list column_order, - list null_precedence, -): - return columns_from_pylibcudf_table( - plc.quantiles.quantiles( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]), - q, - interp, - is_input_sorted, - column_order, - null_precedence - ) - ) diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index ceeff71683c..dca9c4cc3fc 100644 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -11,35 +11,5 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= - -set(cython_sources - attributes.pyx - capitalize.pyx - case.pyx - char_types.pyx - combine.pyx - contains.pyx - extract.pyx - find.pyx - find_multiple.pyx - findall.pyx - json.pyx - padding.pyx - repeat.pyx - replace.pyx - replace_re.pyx - strip.pyx - substring.pyx - translate.pyx - wrap.pyx -) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) - add_subdirectory(convert) add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 4c0ec2d9ac5..b795c54c112 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -32,62 +32,10 @@ detokenize, tokenize_with_vocabulary, ) -from cudf._lib.strings.attributes import ( - code_points, - count_bytes, - count_characters, -) -from cudf._lib.strings.capitalize import capitalize, is_title, title -from cudf._lib.strings.case import swapcase, to_lower, to_upper -from cudf._lib.strings.char_types import ( - filter_alphanum, - is_alnum, - is_alpha, - is_decimal, - is_digit, - is_lower, - is_numeric, - is_space, - is_upper, -) -from cudf._lib.strings.combine import ( - concatenate, - join, - join_lists_with_column, - join_lists_with_scalar, -) -from cudf._lib.strings.contains import contains_re, count_re, like, match_re from cudf._lib.strings.convert.convert_fixed_point import to_decimal from cudf._lib.strings.convert.convert_floats import is_float from cudf._lib.strings.convert.convert_integers import is_integer from cudf._lib.strings.convert.convert_urls import url_decode, url_encode -from cudf._lib.strings.extract import extract -from cudf._lib.strings.find import ( - contains, - contains_multiple, - endswith, - endswith_multiple, - find, - rfind, - startswith, - startswith_multiple, -) -from cudf._lib.strings.find_multiple import find_multiple -from cudf._lib.strings.findall import find_re, findall -from cudf._lib.strings.json import get_json_object -from cudf._lib.strings.padding import center, ljust, 
pad, rjust, zfill -from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence -from cudf._lib.strings.replace import ( - insert, - replace, - replace_multi, - slice_replace, -) -from cudf._lib.strings.replace_re import ( - replace_multi_re, - replace_re, - replace_with_backrefs, -) from cudf._lib.strings.split.partition import partition, rpartition from cudf._lib.strings.split.split import ( rsplit, @@ -99,7 +47,3 @@ split_record, split_record_re, ) -from cudf._lib.strings.strip import lstrip, rstrip, strip -from cudf._lib.strings.substring import get, slice_from, slice_strings -from cudf._lib.strings.translate import filter_characters, translate -from cudf._lib.strings.wrap import wrap diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx deleted file mode 100644 index df81b3942b4..00000000000 --- a/python/cudf/cudf/_lib/strings/attributes.pyx +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def count_characters(Column source_strings): - """ - Returns an integer numeric column containing the - length of each string in characters. - """ - plc_column = plc.strings.attributes.count_characters( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def count_bytes(Column source_strings): - """ - Returns an integer numeric column containing the - number of bytes of each string. - """ - plc_column = plc.strings.attributes.count_bytes( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def code_points(Column source_strings): - """ - Creates a numeric column with code point values (integers) - for each character of each string. - """ - plc_column = plc.strings.attributes.code_points( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx deleted file mode 100644 index 42c40e2e753..00000000000 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def capitalize(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.capitalize( - source_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def title(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.title( - source_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def is_title(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.is_title( - source_strings.to_pylibcudf(mode="read") - ) - ) diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx deleted file mode 100644 index ad4cbb6f088..00000000000 --- a/python/cudf/cudf/_lib/strings/case.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import case - - -@acquire_spill_lock() -def to_upper(Column source_strings): - return Column.from_pylibcudf( - case.to_upper( - source_strings.to_pylibcudf(mode='read') - ) - ) - - -@acquire_spill_lock() -def to_lower(Column source_strings): - return Column.from_pylibcudf( - case.to_lower( - source_strings.to_pylibcudf(mode='read') - ) - ) - - -@acquire_spill_lock() -def swapcase(Column source_strings): - return Column.from_pylibcudf( - case.swapcase( - source_strings.to_pylibcudf(mode='read') - ) - ) diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx deleted file mode 100644 index a57ce29eb45..00000000000 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import char_types - - -@acquire_spill_lock() -def filter_alphanum(Column source_strings, object py_repl, bool keep=True): - """ - Returns a Column of strings keeping only alphanumeric character types. - """ - plc_column = char_types.filter_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.ALL_TYPES if keep - else char_types.StringCharacterTypes.ALPHANUM, - py_repl.device_value.c_value, - char_types.StringCharacterTypes.ALPHANUM if keep - else char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_decimal(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only decimal characters -- those that can be used - to extract base10 numbers. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.DECIMAL, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_alnum(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only alphanumeric characters. - - Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal() - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.ALPHANUM, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_alpha(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only alphabetic characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.ALPHA, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_digit(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only decimal and digit characters. 
- """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.DIGIT, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_numeric(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only numeric characters. These include digit and - numeric characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.NUMERIC, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_upper(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only upper-case characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.UPPER, - char_types.StringCharacterTypes.CASE_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_lower(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only lower-case characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.LOWER, - char_types.StringCharacterTypes.CASE_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_space(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contains all characters which are spaces only. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.SPACE, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx deleted file mode 100644 index 0f7b27d85d7..00000000000 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - -import cudf - - -@acquire_spill_lock() -def concatenate(list source_strings, - object sep, - object na_rep): - """ - Returns a Column by concatenating strings column-wise in `source_strings` - with the specified `sep` between each column and - `na`/`None` values are replaced by `na_rep` - """ - plc_column = plc.strings.combine.concatenate( - plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), - sep.device_value.c_value, - na_rep.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def join(Column source_strings, - object sep, - object na_rep): - """ - Returns a Column by concatenating strings row-wise in `source_strings` - with the specified `sep` between each column and - `na`/`None` values are replaced by `na_rep` - """ - plc_column = plc.strings.combine.join_strings( - source_strings.to_pylibcudf(mode="read"), - sep.device_value.c_value, - na_rep.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def join_lists_with_scalar( - Column source_strings, - object py_separator, - object py_narep): - """ - Returns a Column by concatenating Lists of strings row-wise - in `source_strings` with the specified `py_separator` - between each string in lists and ``/`None` values - are replaced by `py_narep` - """ - plc_column = plc.strings.combine.join_list_elements( - source_strings.to_pylibcudf(mode="read"), - py_separator.device_value.c_value, - py_narep.device_value.c_value, - cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, - plc.strings.combine.SeparatorOnNulls.YES, - plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def join_lists_with_column( - Column source_strings, - Column separator_strings, - object py_source_narep, - object py_separator_narep): - """ - Returns a Column by concatenating Lists of strings row-wise in - `source_strings` with a corresponding separator at the same - position in `separator_strings` and ``/`None` values in - `source_strings` are replaced by `py_source_narep` and - ``/`None` values in `separator_strings` are replaced - by `py_separator_narep` - """ - plc_column = plc.strings.combine.join_list_elements( - source_strings.to_pylibcudf(mode="read"), - separator_strings.to_pylibcudf(mode="read"), - py_separator_narep.device_value.c_value, - py_source_narep.device_value.c_value, - plc.strings.combine.SeparatorOnNulls.YES, - plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx deleted file mode 100644 index 03b4887f200..00000000000 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import contains -from pylibcudf.strings.regex_program import RegexProgram - - -@acquire_spill_lock() -def contains_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column of boolean values with True for `source_strings` - that contain regular expression `reg_ex`. 
- """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def count_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column with count of occurrences of `reg_ex` in - each string of `source_strings` - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.count_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def match_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column with each value True if the string matches `reg_ex` - regular expression with each record of `source_strings` - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def like(Column source_strings, object py_pattern, object py_escape): - """ - Returns a Column with each value True if the string matches the - `py_pattern` like expression with each record of `source_strings` - """ - plc_column = contains.like( - source_strings.to_pylibcudf(mode="read"), - py_pattern.device_value.c_value, - py_escape.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx deleted file mode 100644 index 5bf336f4f3c..00000000000 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def extract(Column source_strings, object pattern, uint32_t flags): - """ - Returns data which contains extracted capture groups provided in - `pattern` for all `source_strings`. - The returning data contains one row for each subject string, - and one column for each group. - """ - prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) - plc_result = plc.strings.extract.extract( - source_strings.to_pylibcudf(mode="read"), prog - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx deleted file mode 100644 index 2d284d1aa9d..00000000000 --- a/python/cudf/cudf/_lib/strings/find.pyx +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def contains(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain the pattern given in `py_target`. - """ - return Column.from_pylibcudf( - plc.strings.find.contains( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def contains_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain the corresponding string in `target_strings`. 
- """ - return Column.from_pylibcudf( - plc.strings.find.contains( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def endswith(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that end with the pattern given in `py_target`. - """ - - return Column.from_pylibcudf( - plc.strings.find.ends_with( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def endswith_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that end with corresponding location - in `target_strings`. - """ - return Column.from_pylibcudf( - plc.strings.find.ends_with( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def startswith(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that start with the pattern given in `py_target`. - """ - return Column.from_pylibcudf( - plc.strings.find.starts_with( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def startswith_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that begin with corresponding location - in `target_strings`. - """ - return Column.from_pylibcudf( - plc.strings.find.starts_with( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def find(Column source_strings, - object py_target, - size_type start, - size_type end): - """ - Returns a Column containing lowest indexes in each string of - `source_strings` that fully contain `py_target` string. - Scan portion of strings in `source_strings` can be - controlled by setting `start` and `end` values. - """ - return Column.from_pylibcudf( - plc.strings.find.find( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value, - start, - end - ) - ) - - -@acquire_spill_lock() -def rfind(Column source_strings, - object py_target, - size_type start, - size_type end): - """ - Returns a Column containing highest indexes in each string of - `source_strings` that fully contain `py_target` string. - Scan portion of strings in `source_strings` can be - controlled by setting `start` and `end` values. - """ - - return Column.from_pylibcudf( - plc.strings.find.rfind( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value, - start, - end - ) - ) diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx deleted file mode 100644 index 39e0013769f..00000000000 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def find_multiple(Column source_strings, Column target_strings): - """ - Returns a column with character position values where each - of the `target_strings` are found in each string of `source_strings`. 
- """ - plc_result = plc.strings.find_multiple.find_multiple( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx deleted file mode 100644 index 3e7a504d535..00000000000 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def findall(Column source_strings, object pattern, uint32_t flags): - """ - Returns data with all non-overlapping matches of `pattern` - in each string of `source_strings` as a lists column. - """ - prog = plc.strings.regex_program.RegexProgram.create( - str(pattern), flags - ) - plc_result = plc.strings.findall.findall( - source_strings.to_pylibcudf(mode="read"), - prog, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def find_re(Column source_strings, object pattern, uint32_t flags): - """ - Returns character positions where the pattern first matches - the elements in source_strings. - """ - prog = plc.strings.regex_program.RegexProgram.create( - str(pattern), flags - ) - plc_result = plc.strings.findall.find_re( - source_strings.to_pylibcudf(mode="read"), - prog, - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx deleted file mode 100644 index 374a104635a..00000000000 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import pylibcudf as plc -from pylibcudf.json cimport GetJsonObjectOptions - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def get_json_object( - Column col, - object py_json_path, - GetJsonObjectOptions options -): - """ - Apply a JSONPath string to all rows in an input column - of json strings. - """ - plc_column = plc.json.get_json_object( - col.to_pylibcudf(mode="read"), - py_json_path.device_value.c_value, - options - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx deleted file mode 100644 index 015a2ebab8a..00000000000 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def pad(Column source_strings, - size_type width, - fill_char, - side=plc.strings.side_type.SideType.LEFT): - """ - Returns a Column by padding strings in `source_strings` - up to the given `width`. Direction of padding is to be specified by `side`. - The additional characters being filled can be changed by specifying - `fill_char`. - """ - plc_result = plc.strings.padding.pad( - source_strings.to_pylibcudf(mode="read"), - width, - side, - fill_char, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def zfill(Column source_strings, - size_type width): - """ - Returns a Column by prepending strings in `source_strings` - with '0' characters up to the given `width`. 
- """ - plc_result = plc.strings.padding.zfill( - source_strings.to_pylibcudf(mode="read"), - width - ) - return Column.from_pylibcudf(plc_result) - - -def center(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling left and right side of strings - in `source_strings` with additional character, `fill_char` - up to the given `width`. - """ - return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH) - - -def ljust(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling right side of strings in `source_strings` - with additional character, `fill_char` up to the given `width`. - """ - return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT) - - -def rjust(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling left side of strings in `source_strings` - with additional character, `fill_char` up to the given `width`. - """ - return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx deleted file mode 100644 index 43649d4defe..00000000000 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def repeat_scalar(Column source_strings, - size_type repeats): - """ - Returns a Column after repeating - each string in `source_strings` - `repeats` number of times. - """ - plc_result = plc.strings.repeat.repeat_strings( - source_strings.to_pylibcudf(mode="read"), - repeats - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def repeat_sequence(Column source_strings, - Column repeats): - """ - Returns a Column after repeating - each string in `source_strings` - `repeats` number of times. - """ - plc_result = plc.strings.repeat.repeat_strings( - source_strings.to_pylibcudf(mode="read"), - repeats.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx deleted file mode 100644 index a260c4e4f45..00000000000 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf as plc - - -@acquire_spill_lock() -def slice_replace(Column source_strings, - size_type start, - size_type stop, - object py_repl): - """ - Returns a Column by replacing specified section - of each string with `py_repl`. Positions can be - specified with `start` and `stop` params. - """ - - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace_slice( - source_strings.to_pylibcudf(mode="read"), - repl.c_value, - start, - stop - )) - - -@acquire_spill_lock() -def insert(Column source_strings, - size_type start, - object py_repl): - """ - Returns a Column by inserting a specified - string `py_repl` at a specific position in all strings. 
- """ - - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace_slice( - source_strings.to_pylibcudf(mode="read"), - repl.c_value, - start, - start, - )) - - -@acquire_spill_lock() -def replace(Column source_strings, - object py_target, - object py_repl, - int32_t maxrepl): - """ - Returns a Column after replacing occurrences of - patterns `py_target` with `py_repl` in `source_strings`. - `maxrepl` indicates number of replacements to make from start. - """ - cdef DeviceScalar target = py_target.device_value - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace( - source_strings.to_pylibcudf(mode="read"), - target.c_value, - repl.c_value, - maxrepl - )) - - -@acquire_spill_lock() -def replace_multi(Column source_strings, - Column target_strings, - Column repl_strings): - """ - Returns a Column after replacing occurrences of - patterns `target_strings` with `repl_strings` in `source_strings`. - """ - return Column.from_pylibcudf(plc.strings.replace.replace_multiple( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read"), - repl_strings.to_pylibcudf(mode="read"), - )) diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx deleted file mode 100644 index 462d5c903e8..00000000000 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from pylibcudf.libcudf.types cimport size_type -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def replace_re(Column source_strings, - object pattern, - object py_repl, - size_type n): - """ - Returns a Column after replacing occurrences regular - expressions `pattern` with `py_repl` in `source_strings`. - `n` indicates the number of resplacements to be made from - start. (-1 indicates all) - """ - plc_column = plc.strings.replace_re.replace_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT - ), - py_repl.device_value.c_value, - n - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def replace_with_backrefs( - Column source_strings, - object pattern, - object repl): - """ - Returns a Column after using the `repl` back-ref template to create - new string with the extracted elements found using - `pattern` regular expression in `source_strings`. - """ - plc_column = plc.strings.replace_re.replace_with_backrefs( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT - ), - repl - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def replace_multi_re(Column source_strings, - list patterns, - Column repl_strings): - """ - Returns a Column after replacing occurrences of multiple - regular expressions `patterns` with their corresponding - strings in `repl_strings` in `source_strings`. 
- """ - plc_column = plc.strings.replace_re.replace_re( - source_strings.to_pylibcudf(mode="read"), - patterns, - repl_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx deleted file mode 100644 index 982c5a600e7..00000000000 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -import pylibcudf as plc - - -@acquire_spill_lock() -def strip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from left and right side - can be specified by `py_repl`. - """ - plc_result = plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.side_type.SideType.BOTH, - py_repl.device_value.c_value, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def lstrip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from left side can - be specified by `py_repl`. - """ - plc_result = plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.side_type.SideType.LEFT, - py_repl.device_value.c_value, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def rstrip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from right side can - be specified by `py_repl`. - """ - plc_result = plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.side_type.SideType.RIGHT, - py_repl.device_value.c_value, - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx deleted file mode 100644 index db96d99c7b6..00000000000 --- a/python/cudf/cudf/_lib/strings/substring.pyx +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import numpy as np - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf as plc - - -@acquire_spill_lock() -def slice_strings(Column source_strings, - object start, - object end, - object step): - """ - Returns a Column by extracting a substring of each string - at given start and end positions. Slicing can also be - performed in steps by skipping `step` number of - characters in a string. - """ - cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32) - cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32) - cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - start_scalar.c_value, - end_scalar.c_value, - step_scalar.c_value - ) - ) - - -@acquire_spill_lock() -def slice_from(Column source_strings, - Column starts, - Column stops): - """ - Returns a Column by extracting a substring of each string - at given starts and stops positions. `starts` and `stops` - here are positions per element in the string-column. 
- """ - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - starts.to_pylibcudf(mode="read"), - stops.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def get(Column source_strings, - object index): - """ - Returns a Column which contains only single - character from each input string. The index of - characters required can be controlled by passing `index`. - """ - - if index < 0: - next_index = index - 1 - step = -1 - else: - next_index = index + 1 - step = 1 - cdef DeviceScalar start_scalar = as_device_scalar(index, np.int32) - cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32) - cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - start_scalar.c_value, - end_scalar.c_value, - step_scalar.c_value - ) - ) diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx deleted file mode 100644 index 3ef478532c2..00000000000 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def translate(Column source_strings, - object mapping_table): - """ - Translates individual characters within each string - if present in the mapping_table. - """ - plc_result = plc.strings.translate.translate( - source_strings.to_pylibcudf(mode="read"), - mapping_table, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def filter_characters(Column source_strings, - object mapping_table, - bool keep, - object py_repl): - """ - Removes or keeps individual characters within each string - using the provided mapping_table. - """ - plc_result = plc.strings.translate.filter_characters( - source_strings.to_pylibcudf(mode="read"), - mapping_table, - plc.strings.translate.FilterType.KEEP - if keep else plc.strings.translate.FilterType.REMOVE, - py_repl.device_value.c_value - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx deleted file mode 100644 index 2b40f01f818..00000000000 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def wrap(Column source_strings, - size_type width): - """ - Returns a Column by wrapping long strings - in the Column to be formatted in paragraphs - with length less than a given `width`. 
- """ - plc_result = plc.strings.wrap.wrap( - source_strings.to_pylibcudf(mode="read"), - width - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f0df4a3c1b3..8ddfd4a54ae 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,11 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.null_mask import ( - MaskState, - bitmask_allocation_size_bytes, - create_null_mask, -) from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import ( apply_boolean_mask, @@ -383,7 +378,7 @@ def memory_usage(self) -> int: if self.data is not None: n += self.data.size if self.nullable: - n += bitmask_allocation_size_bytes(self.size) + n += plc.null_mask.bitmask_allocation_size_bytes(self.size) return n def _fill( @@ -410,7 +405,11 @@ def _fill( ) if not slr.is_valid() and not self.nullable: - mask = create_null_mask(self.size, state=MaskState.ALL_VALID) + mask = as_buffer( + plc.null_mask.create_null_mask( + self.size, plc.null_mask.MaskState.ALL_VALID + ) + ) self.set_base_mask(mask) libcudf.filling.fill_in_place(self, begin, end, slr.device_value) @@ -1553,7 +1552,11 @@ def column_empty( data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) if masked: - mask = create_null_mask(row_count, state=MaskState.ALL_NULL) + mask = as_buffer( + plc.null_mask.create_null_mask( + row_count, plc.null_mask.MaskState.ALL_NULL + ) + ) else: mask = None @@ -2210,7 +2213,9 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: typestr = desc["typestr"] typecode = typestr[1] if typecode == "t": - mask_size = bitmask_allocation_size_bytes(desc["shape"][0]) + mask_size = plc.null_mask.bitmask_allocation_size_bytes( + desc["shape"][0] + ) return as_buffer(data=desc["data"][0], size=mask_size, owner=obj) elif typecode == "b": col = as_column(cai_mask) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 9962663e811..42df5123014 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -69,10 +69,7 @@ def __init__( @cached_property def memory_usage(self): - n = 0 - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) - + n = super().memory_usage child0_size = (self.size + 1) * self.base_children[0].dtype.itemsize current_base_child = self.base_children[1] current_offset = self.offset @@ -97,7 +94,7 @@ def memory_usage(self): ) * current_base_child.dtype.itemsize if current_base_child.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes( + n += plc.null_mask.bitmask_allocation_size_bytes( current_base_child.size ) return n diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a9ab2d373fd..d45c76d3ddb 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2,14 +2,16 @@ from __future__ import annotations +import itertools import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, cast, overload +from typing import TYPE_CHECKING, Literal, cast, overload import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import pylibcudf as plc @@ -20,22 +22,15 @@ from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype 
+from cudf.core.buffer import acquire_spill_lock from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import can_convert_to_column - -def str_to_boolean(column: StringColumn): - """Takes in string column and returns boolean column""" - return ( - libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") - ).fillna(False) - - if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Callable, Sequence import cupy import numba.cuda @@ -50,6 +45,16 @@ def str_to_boolean(column: StringColumn): from cudf.core.buffer import Buffer +def str_to_boolean(column: StringColumn): + """Takes in string column and returns boolean column""" + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + + _str_to_numeric_typecast_functions = { cudf.api.types.dtype("int8"): str_cast.stoi8, cudf.api.types.dtype("int16"): str_cast.stoi16, @@ -213,10 +218,12 @@ def len(self) -> SeriesOrIndex: 3 dtype: int32 """ - - return self._return_or_inplace( - libstrings.count_characters(self._column) - ) + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + self._column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) def byte_count(self) -> SeriesOrIndex: """ @@ -245,9 +252,12 @@ def byte_count(self) -> SeriesOrIndex: 2 11 dtype: int32 """ - return self._return_or_inplace( - libstrings.count_bytes(self._column), - ) + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_bytes( + self._column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) @overload def cat( @@ -347,19 +357,70 @@ def cat(self, others=None, sep=None, na_rep=None): sep = "" if others is None: - data = libstrings.join( - self._column, - cudf.Scalar(sep), - cudf.Scalar(na_rep, "str"), - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_strings( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(sep).device_value.c_value, + cudf.Scalar(na_rep, "str").device_value.c_value, + ) + data = Column.from_pylibcudf(plc_column) else: - other_cols = _get_cols_list(self._parent, others) - all_cols = [self._column] + other_cols - data = libstrings.concatenate( - all_cols, - cudf.Scalar(sep), - cudf.Scalar(na_rep, "str"), + parent_index = ( + self._parent.index + if isinstance(self._parent, cudf.Series) + else self._parent ) + if ( + can_convert_to_column(others) + and len(others) > 0 + and ( + can_convert_to_column( + others.iloc[0] + if isinstance(others, cudf.Series) + else others[0] + ) + ) + ): + other_cols = ( + column.as_column(frame.reindex(parent_index), dtype="str") + if ( + parent_index is not None + and isinstance(frame, cudf.Series) + and not frame.index.equals(parent_index) + ) + else column.as_column(frame, dtype="str") + for frame in others + ) + elif others is not None and not isinstance(others, StringMethods): + if ( + parent_index is not None + and isinstance(others, cudf.Series) + and not others.index.equals(parent_index) + ): + others = others.reindex(parent_index) + + other_cols = [column.as_column(others, dtype="str")] + else: + raise TypeError( + "others must be Series, Index, 
DataFrame, np.ndarrary " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.concatenate( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + [self._column], other_cols + ) + ] + ), + cudf.Scalar(sep).device_value.c_value, + cudf.Scalar(na_rep, "str").device_value.c_value, + ) + data = Column.from_pylibcudf(plc_column) if len(data) == 1 and data.null_count == 1: data = cudf.core.column.as_column("", length=len(data)) @@ -516,9 +577,18 @@ def join( strings_column = self._split_by_character() if is_scalar(sep): - data = libstrings.join_lists_with_scalar( - strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep) - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_list_elements( + strings_column.to_pylibcudf(mode="read"), + cudf.Scalar(sep).device_value.c_value, + cudf.Scalar(string_na_rep).device_value.c_value, + cudf._lib.scalar.DeviceScalar( + "", cudf.dtype("object") + ).c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, + ) + data = Column.from_pylibcudf(plc_column) elif can_convert_to_column(sep): sep_column = column.as_column(sep) if len(sep_column) != len(strings_column): @@ -531,13 +601,16 @@ def join( f"sep_na_rep should be a string scalar, got {sep_na_rep} " f"of type: {type(sep_na_rep)}" ) - - data = libstrings.join_lists_with_column( - strings_column, - sep_column, - cudf.Scalar(string_na_rep), - cudf.Scalar(sep_na_rep), - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_list_elements( + strings_column.to_pylibcudf(mode="read"), + sep_column.to_pylibcudf(mode="read"), + cudf.Scalar(sep_na_rep).device_value.c_value, + cudf.Scalar(string_na_rep).device_value.c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, + ) + data = Column.from_pylibcudf(plc_column) else: raise TypeError( f"sep should be an str, array-like or Series object, " @@ -627,9 +700,18 @@ def extract( "unsupported value for `flags` parameter" ) - data = libstrings.extract(self._column, pat, flags) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = plc.strings.extract.extract( + self._column.to_pylibcudf(mode="read"), prog + ) + data = dict( + enumerate( + Column.from_pylibcudf(col) for col in plc_result.columns() + ) + ) if len(data) == 1 and expand is False: - _, data = data.popitem() + _, data = data.popitem() # type: ignore[assignment] return self._return_or_inplace(data, expand=expand) def contains( @@ -765,26 +847,41 @@ def contains( if is_scalar(pat): if regex: - result_col = libstrings.contains_re(self._column, pat, flags) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create( + pat, flags + ) + plc_result = plc.strings.contains.contains_re( + self._column.to_pylibcudf(mode="read"), prog + ) + result_col = Column.from_pylibcudf(plc_result) else: if case is False: - input_column = libstrings.to_lower(self._column) - pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore + input_column = self.lower()._column # type: ignore[union-attr] + plc_pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore[union-attr] else: input_column = self._column - pat = cudf.Scalar(pat, dtype="str") # type: ignore - result_col = libstrings.contains(input_column, pat) + plc_pat = cudf.Scalar(pat, dtype="str") + with 
acquire_spill_lock(): + plc_result = plc.strings.find.contains( + input_column.to_pylibcudf(mode="read"), + plc_pat.device_value.c_value, + ) + result_col = Column.from_pylibcudf(plc_result) else: # TODO: we silently ignore the `regex=` flag here if case is False: - input_column = libstrings.to_lower(self._column) - col_pat = libstrings.to_lower( - column.as_column(pat, dtype="str") - ) + input_column = self.lower()._column # type: ignore[union-attr] + col_pat = cudf.Index(pat, dtype="str").str.lower()._column # type: ignore[union-attr] else: input_column = self._column col_pat = column.as_column(pat, dtype="str") - result_col = libstrings.contains_multiple(input_column, col_pat) + with acquire_spill_lock(): + plc_result = plc.strings.find.contains( + input_column.to_pylibcudf(mode="read"), + col_pat.to_pylibcudf(mode="read"), + ) + result_col = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result_col) def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: @@ -850,11 +947,15 @@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: "expected esc to contain less than or equal to 1 characters" ) - result_col = libstrings.like( - self._column, cudf.Scalar(pat, "str"), cudf.Scalar(esc, "str") - ) + with acquire_spill_lock(): + plc_result = plc.strings.contains.like( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(pat, "str").device_value.c_value, + cudf.Scalar(esc, "str").device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) - return self._return_or_inplace(result_col) + return self._return_or_inplace(result) def repeat( self, @@ -901,17 +1002,16 @@ def repeat( 2 ccc dtype: object """ - if can_convert_to_column(repeats): - return self._return_or_inplace( - libstrings.repeat_sequence( - self._column, - column.as_column(repeats, dtype="int"), - ), + with acquire_spill_lock(): + if can_convert_to_column(repeats): + repeats = column.as_column(repeats, dtype="int").to_pylibcudf( + mode="read" + ) + plc_result = plc.strings.repeat.repeat_strings( + self._column.to_pylibcudf(mode="read"), repeats ) - - return self._return_or_inplace( - libstrings.repeat_scalar(self._column, repeats) - ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def replace( self, @@ -997,19 +1097,22 @@ def replace( "`pat` and `repl` are list-like inputs" ) - return self._return_or_inplace( - libstrings.replace_multi_re( - self._column, - list(pat), - column.as_column(repl, dtype="str"), + if regex: + with acquire_spill_lock(): + plc_result = plc.strings.replace_re.replace_re( + self._column.to_pylibcudf(mode="read"), + list(pat), + column.as_column(repl, dtype="str").to_pylibcudf( + mode="read" + ), + ) + result = Column.from_pylibcudf(plc_result) + else: + result = self._column.replace_multiple( + cast(StringColumn, column.as_column(pat, dtype="str")), + cast(StringColumn, column.as_column(repl, dtype="str")), ) - if regex - else libstrings.replace_multi( - self._column, - column.as_column(pat, dtype="str"), - column.as_column(repl, dtype="str"), - ), - ) + return self._return_or_inplace(result) # Pandas treats 0 as all if n == 0: n = -1 @@ -1019,18 +1122,25 @@ def replace( pat = pat.pattern # Pandas forces non-regex replace when pat is a single-character - return self._return_or_inplace( - libstrings.replace_re( - self._column, pat, cudf.Scalar(repl, "str"), n - ) - if regex is True and len(pat) > 1 - else libstrings.replace( - self._column, - cudf.Scalar(pat, "str"), - cudf.Scalar(repl, "str"), - n, - ), - ) + with 
acquire_spill_lock(): + if regex is True and len(pat) > 1: + plc_result = plc.strings.replace_re.replace_re( + self._column.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + cudf.Scalar(repl, "str").device_value.c_value, + n, + ) + else: + plc_result = plc.strings.replace.replace( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(pat).device_value.c_value, + cudf.Scalar(repl).device_value.c_value, + n, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: r""" @@ -1058,14 +1168,20 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: 1 ZV576 dtype: object """ - # If 'pat' is re.Pattern then get the pattern string from it if isinstance(pat, re.Pattern): pat = pat.pattern - return self._return_or_inplace( - libstrings.replace_with_backrefs(self._column, pat, repl) - ) + with acquire_spill_lock(): + plc_result = plc.strings.replace_re.replace_with_backrefs( + self._column.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + repl, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def slice( self, @@ -1136,10 +1252,28 @@ def slice( 2 cm dtype: object """ + param_dtype = np.dtype(np.int32) + with acquire_spill_lock(): + plc_result = plc.strings.slice.slice_strings( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(start, param_dtype).device_value.c_value, + cudf.Scalar(stop, param_dtype).device_value.c_value, + cudf.Scalar(step, param_dtype).device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) - return self._return_or_inplace( - libstrings.slice_strings(self._column, start, stop, step), - ) + def _all_characters_of_type( + self, + char_type: plc.strings.char_types.StringCharacterTypes, + case_type: plc.strings.char_types.StringCharacterTypes = plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) -> SeriesOrIndex: + with acquire_spill_lock(): + plc_column = plc.strings.char_types.all_characters_of_type( + self._column.to_pylibcudf(mode="read"), char_type, case_type + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) def isinteger(self) -> SeriesOrIndex: """ @@ -1396,7 +1530,9 @@ def isdecimal(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_decimal(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.DECIMAL + ) def isalnum(self) -> SeriesOrIndex: """ @@ -1467,7 +1603,9 @@ def isalnum(self) -> SeriesOrIndex: 2 False dtype: bool """ - return self._return_or_inplace(libstrings.is_alnum(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.ALPHANUM + ) def isalpha(self) -> SeriesOrIndex: """ @@ -1525,7 +1663,9 @@ def isalpha(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_alpha(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.ALPHA + ) def isdigit(self) -> SeriesOrIndex: """ @@ -1589,7 +1729,9 @@ def isdigit(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_digit(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.DIGIT + ) def 
isnumeric(self) -> SeriesOrIndex: """ @@ -1659,7 +1801,9 @@ def isnumeric(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_numeric(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.NUMERIC + ) def isupper(self) -> SeriesOrIndex: """ @@ -1718,7 +1862,10 @@ def isupper(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_upper(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.UPPER, + plc.strings.char_types.StringCharacterTypes.CASE_TYPES, + ) def islower(self) -> SeriesOrIndex: """ @@ -1777,7 +1924,10 @@ def islower(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_lower(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.LOWER, + plc.strings.char_types.StringCharacterTypes.CASE_TYPES, + ) def isipv4(self) -> SeriesOrIndex: """ @@ -1844,7 +1994,7 @@ def lower(self) -> SeriesOrIndex: 3 swapcase dtype: object """ - return self._return_or_inplace(libstrings.to_lower(self._column)) + return self._return_or_inplace(self._column.to_lower()) def upper(self) -> SeriesOrIndex: """ @@ -1895,7 +2045,7 @@ def upper(self) -> SeriesOrIndex: 3 SWAPCASE dtype: object """ - return self._return_or_inplace(libstrings.to_upper(self._column)) + return self._return_or_inplace(self._column.to_upper()) def capitalize(self) -> SeriesOrIndex: """ @@ -1923,7 +2073,7 @@ def capitalize(self) -> SeriesOrIndex: 1 Goodbye, friend dtype: object """ - return self._return_or_inplace(libstrings.capitalize(self._column)) + return self._return_or_inplace(self._column.capitalize()) def swapcase(self) -> SeriesOrIndex: """ @@ -1970,7 +2120,7 @@ def swapcase(self) -> SeriesOrIndex: 3 sWaPcAsE dtype: object """ - return self._return_or_inplace(libstrings.swapcase(self._column)) + return self._return_or_inplace(self._column.swapcase()) def title(self) -> SeriesOrIndex: """ @@ -2017,7 +2167,7 @@ def title(self) -> SeriesOrIndex: 3 Swapcase dtype: object """ - return self._return_or_inplace(libstrings.title(self._column)) + return self._return_or_inplace(self._column.title()) def istitle(self) -> SeriesOrIndex: """ @@ -2043,7 +2193,7 @@ def istitle(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_title(self._column)) + return self._return_or_inplace(self._column.is_title()) def filter_alphanum( self, repl: str | None = None, keep: bool = True @@ -2078,14 +2228,22 @@ def filter_alphanum( if repl is None: repl = "" - return self._return_or_inplace( - libstrings.filter_alphanum( - self._column, cudf.Scalar(repl, "str"), keep - ), - ) + with acquire_spill_lock(): + plc_column = plc.strings.char_types.filter_characters_of_type( + self._column.to_pylibcudf(mode="read"), + plc.strings.char_types.StringCharacterTypes.ALL_TYPES + if keep + else plc.strings.char_types.StringCharacterTypes.ALPHANUM, + cudf.Scalar(repl, "str").device_value.c_value, + plc.strings.char_types.StringCharacterTypes.ALPHANUM + if keep + else plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) def slice_from( - self, starts: "cudf.Series", stops: "cudf.Series" + self, starts: cudf.Series, stops: cudf.Series ) -> SeriesOrIndex: """ Return substring of each string using positions for each string. 
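# --- Illustrative sketch, not part of the patch above ---
# The is*() predicates in these hunks now all funnel through one helper that
# hands the column straight to pylibcudf's char_types API. The calls below are
# copied from the diff itself; only the sample Series values are made up.
import cudf
import pylibcudf as plc
from cudf._lib.column import Column
from cudf.core.buffer import acquire_spill_lock

s = cudf.Series(["23", "3.14", "abc"])
with acquire_spill_lock():
    plc_result = plc.strings.char_types.all_characters_of_type(
        s._column.to_pylibcudf(mode="read"),
        plc.strings.char_types.StringCharacterTypes.DECIMAL,
        plc.strings.char_types.StringCharacterTypes.ALL_TYPES,
    )
    result = Column.from_pylibcudf(plc_result)
# Matches s.str.isdecimal(): only "23" is made up entirely of decimal characters.
print(cudf.Series._from_column(result))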
@@ -2122,14 +2280,14 @@ def slice_from( 1 re dtype: object """ - - return self._return_or_inplace( - libstrings.slice_from( - self._column, - column.as_column(starts), - column.as_column(stops), - ), - ) + with acquire_spill_lock(): + plc_result = plc.strings.slice.slice_strings( + self._column.to_pylibcudf(mode="read"), + starts._column.to_pylibcudf(mode="read"), + stops._column.to_pylibcudf(mode="read"), + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def slice_replace( self, @@ -2217,11 +2375,15 @@ def slice_replace( if repl is None: repl = "" - return self._return_or_inplace( - libstrings.slice_replace( - self._column, start, stop, cudf.Scalar(repl, "str") - ), - ) + with acquire_spill_lock(): + plc_result = plc.strings.replace.replace_slice( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(repl, "str").device_value.c_value, + start, + stop, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: """ @@ -2266,12 +2428,7 @@ def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: 1 0123456789_ dtype: object """ - if repl is None: - repl = "" - - return self._return_or_inplace( - libstrings.insert(self._column, start, cudf.Scalar(repl, "str")), - ) + return self.slice_replace(start, start, repl) def get(self, i: int = 0) -> SeriesOrIndex: """ @@ -2314,17 +2471,22 @@ def get(self, i: int = 0) -> SeriesOrIndex: 2 f dtype: object """ - - return self._return_or_inplace(libstrings.get(self._column, i)) + if i < 0: + next_index = i - 1 + step = -1 + else: + next_index = i + 1 + step = 1 + return self.slice(i, next_index, step) def get_json_object( self, - json_path, + json_path: str, *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False, - ): + allow_single_quotes: bool = False, + strip_quotes_from_single_strings: bool = True, + missing_fields_as_nulls: bool = False, + ) -> SeriesOrIndex: r""" Applies a JSONPath string to an input strings column where each row in the column is a valid json string @@ -2394,11 +2556,14 @@ def get_json_object( ), missing_fields_as_nulls=missing_fields_as_nulls, ) - return self._return_or_inplace( - libstrings.get_json_object( - self._column, cudf.Scalar(json_path, "str"), options + with acquire_spill_lock(): + plc_result = plc.json.get_json_object( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(json_path, "str").device_value.c_value, + options, ) - ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def split( self, @@ -2893,7 +3058,10 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) def pad( - self, width: int, side: str = "left", fillchar: str = " " + self, + width: int, + side: Literal["left", "both", "right"] = "left", + fillchar: str = " ", ) -> SeriesOrIndex: """ Pad strings in the Series/Index up to width. 
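# --- Illustrative sketch, not part of the patch above ---
# get(i) in the hunk above is no longer a dedicated Cython binding; it is just
# a one-character slice. The helper name below is hypothetical, but the index
# arithmetic mirrors the new implementation.
import cudf

def one_char_slice(s: cudf.Series, i: int) -> cudf.Series:
    # A negative index walks backwards: the exclusive stop is i - 1 with
    # step -1; otherwise the stop is i + 1 with step 1.
    stop, step = (i - 1, -1) if i < 0 else (i + 1, 1)
    return s.str.slice(i, stop, step)

s = cudf.Series(["hello", "world"])
print(one_char_slice(s, 1))   # same result as s.str.get(1) -> ["e", "o"]
print(one_char_slice(s, -1))  # last character of each string -> ["o", "d"]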
@@ -2974,10 +3142,15 @@ def pad( raise ValueError( "side has to be either one of {'left', 'right', 'both'}" ) - - return self._return_or_inplace( - libstrings.pad(self._column, width, fillchar, side) - ) + with acquire_spill_lock(): + plc_result = plc.strings.padding.pad( + self._column.to_pylibcudf(mode="read"), + width, + side, + fillchar, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def zfill(self, width: int) -> SeriesOrIndex: """ @@ -3043,7 +3216,12 @@ def zfill(self, width: int) -> SeriesOrIndex: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace(libstrings.zfill(self._column, width)) + with acquire_spill_lock(): + plc_result = plc.strings.padding.zfill( + self._column.to_pylibcudf(mode="read"), width + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3100,22 +3278,7 @@ def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: 3 --d--- dtype: object """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace( - libstrings.center(self._column, width, fillchar) - ) + return self.pad(width, "both", fillchar) def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3154,22 +3317,7 @@ def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: 3 __ dtype: object """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace( - libstrings.ljust(self._column, width, fillchar) - ) + return self.pad(width, "right", fillchar) def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3208,22 +3356,21 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: 3 __ dtype: object """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) + return self.pad(width, "left", fillchar) - return self._return_or_inplace( - libstrings.rjust(self._column, width, fillchar) - ) + def _strip( + self, side: plc.string.side_type.SideType, to_strip: str | None = None + ) -> SeriesOrIndex: + if to_strip is None: + to_strip = "" + with acquire_spill_lock(): + plc_result = plc.strings.strip.strip( + self._column.to_pylibcudf(mode="read"), + side, + cudf.Scalar(to_strip, "str").device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def strip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" @@ -3277,12 +3424,7 @@ def strip(self, to_strip: str | None = None) -> SeriesOrIndex: 3 dtype: object """ - if to_strip is None: - to_strip = "" - - return 
self._return_or_inplace( - libstrings.strip(self._column, cudf.Scalar(to_strip, "str")) - ) + return self._strip(plc.strings.side_type.SideType.BOTH, to_strip) def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" @@ -3324,12 +3466,7 @@ def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: 3 dtype: object """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str")) - ) + return self._strip(plc.strings.side_type.SideType.LEFT, to_strip) def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" @@ -3379,12 +3516,7 @@ def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: 3 dtype: object """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.rstrip(self._column, cudf.Scalar(to_strip, "str")) - ) + return self._strip(plc.strings.side_type.SideType.RIGHT, to_strip) def wrap(self, width: int, **kwargs) -> SeriesOrIndex: r""" @@ -3478,7 +3610,12 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: "`break_on_hyphens`=False" ) - return self._return_or_inplace(libstrings.wrap(self._column, width)) + with acquire_spill_lock(): + plc_result = plc.strings.wrap.wrap( + self._column.to_pylibcudf(mode="read"), width + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: r""" @@ -3546,10 +3683,37 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: raise NotImplementedError( "unsupported value for `flags` parameter" ) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = plc.strings.contains.count_re( + self._column.to_pylibcudf(mode="read"), prog + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) - return self._return_or_inplace( - libstrings.count_re(self._column, pat, flags) - ) + def _findall( + self, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram], plc.Column + ], + pat: str | re.Pattern, + flags: int = 0, + ) -> SeriesOrIndex: + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = method( + self._column.to_pylibcudf(mode="read"), + prog, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ @@ -3616,16 +3780,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. 
""" - if isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) - - data = libstrings.findall(self._column, pat, flags) - return self._return_or_inplace(data) + return self._findall(plc.strings.findall.findall, pat, flags) def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ @@ -3656,16 +3811,7 @@ def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: 3 2 dtype: int32 """ - if isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "Unsupported value for `flags` parameter" - ) - - data = libstrings.find_re(self._column, pat, flags) - return self._return_or_inplace(data) + return self._findall(plc.strings.findall.find_re, pat, flags) def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ @@ -3723,8 +3869,15 @@ def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: f"got: {patterns_column.dtype}" ) + with acquire_spill_lock(): + plc_result = plc.strings.find_multiple.find_multiple( + self._column.to_pylibcudf(mode="read"), + patterns_column.to_pylibcudf(mode="read"), + ) + result = Column.from_pylibcudf(plc_result) + return cudf.Series._from_column( - libstrings.find_multiple(self._column, patterns_column), + result, name=self._parent.name, index=self._parent.index if isinstance(self._parent, cudf.Series) @@ -3816,9 +3969,34 @@ def isspace(self) -> SeriesOrIndex: 2 False dtype: bool """ - return self._return_or_inplace(libstrings.is_space(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.SPACE + ) - def endswith(self, pat: str) -> SeriesOrIndex: + def _starts_ends_with( + self, + method: Callable[[plc.Column, plc.Column | plc.Scalar], plc.Column], + pat: str | Sequence, + ) -> SeriesOrIndex: + if pat is None: + raise TypeError( + f"expected a string or a sequence-like object, not " + f"{type(pat).__name__}" + ) + elif is_scalar(pat): + plc_pat = cudf.Scalar(pat, "str").device_value.c_value + else: + plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( + mode="read" + ) + with acquire_spill_lock(): + plc_result = method( + self._column.to_pylibcudf(mode="read"), plc_pat + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) + + def endswith(self, pat: str | Sequence) -> SeriesOrIndex: """ Test if the end of each string element matches a pattern. @@ -3860,21 +4038,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: `na` parameter is not yet supported, as cudf uses native strings instead of Python objects. 
""" - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): - result_col = libstrings.endswith( - self._column, cudf.Scalar(pat, "str") - ) - else: - result_col = libstrings.endswith_multiple( - self._column, column.as_column(pat, dtype="str") - ) - - return self._return_or_inplace(result_col) + return self._starts_ends_with(plc.strings.find.ends_with, pat) def startswith(self, pat: str | Sequence) -> SeriesOrIndex: """ @@ -3923,21 +4087,7 @@ def startswith(self, pat: str | Sequence) -> SeriesOrIndex: 3 dtype: bool """ - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): - result_col = libstrings.startswith( - self._column, cudf.Scalar(pat, "str") - ) - else: - result_col = libstrings.startswith_multiple( - self._column, column.as_column(pat, dtype="str") - ) - - return self._return_or_inplace(result_col) + return self._starts_ends_with(plc.strings.find.starts_with, pat) def removesuffix(self, suffix: str) -> SeriesOrIndex: """ @@ -3972,12 +4122,9 @@ def removesuffix(self, suffix: str) -> SeriesOrIndex: """ if suffix is None or len(suffix) == 0: return self._return_or_inplace(self._column) - ends_column = libstrings.endswith( - self._column, cudf.Scalar(suffix, "str") - ) - removed_column = libstrings.slice_strings( - self._column, 0, -len(suffix), None - ) + ends_column = self.endswith(suffix)._column # type: ignore[union-attr] + removed_column = self.slice(0, -len(suffix), None)._column # type: ignore[union-attr] + result = cudf._lib.copying.copy_if_else( removed_column, self._column, ends_column ) @@ -4016,17 +4163,38 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: """ if prefix is None or len(prefix) == 0: return self._return_or_inplace(self._column) - starts_column = libstrings.startswith( - self._column, cudf.Scalar(prefix, "str") - ) - removed_column = libstrings.slice_strings( - self._column, len(prefix), None, None - ) + starts_column = self.startswith(prefix)._column # type: ignore[union-attr] + removed_column = self.slice(len(prefix), None, None)._column # type: ignore[union-attr] result = cudf._lib.copying.copy_if_else( removed_column, self._column, starts_column ) return self._return_or_inplace(result) + def _find( + self, + method: Callable[[plc.Column, plc.Scalar, int, int], plc.Column], + sub: str, + start: int = 0, + end: int | None = None, + ) -> SeriesOrIndex: + if not isinstance(sub, str): + raise TypeError( + f"expected a string object, not {type(sub).__name__}" + ) + + if end is None: + end = -1 + + with acquire_spill_lock(): + plc_result = method( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(sub, "str").device_value.c_value, + start, + end, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) + def find( self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: @@ -4070,19 +4238,7 @@ def find( 3 2 dtype: int32 """ - if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - return self._return_or_inplace(result_col) + return self._find(plc.strings.find.find, sub, start, end) def rfind( self, sub: str, start: int = 0, end: int | None = None @@ -4131,19 +4287,7 @@ def rfind( 2 -1 dtype: int32 """ - if not isinstance(sub, str): - raise TypeError( - 
f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - return self._return_or_inplace(result_col) + return self._find(plc.strings.find.rfind, sub, start, end) def index( self, sub: str, start: int = 0, end: int | None = None @@ -4196,9 +4340,7 @@ def index( if end is None: end = -1 - result_col = libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = self.find(sub, start, end)._column # type: ignore[union-attr] result = self._return_or_inplace(result_col) @@ -4258,9 +4400,7 @@ def rindex( if end is None: end = -1 - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = self.rfind(sub, start, end)._column # type: ignore[union-attr] result = self._return_or_inplace(result_col) @@ -4323,10 +4463,13 @@ def match( raise NotImplementedError( "unsupported value for `flags` parameter" ) - - return self._return_or_inplace( - libstrings.match_re(self._column, pat, flags) - ) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = plc.strings.contains.matches_re( + self._column.to_pylibcudf(mode="read"), prog + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def url_decode(self) -> SeriesOrIndex: """ @@ -4420,9 +4563,12 @@ def code_points(self) -> SeriesOrIndex: 2 99 dtype: int32 """ - return self._return_or_inplace( - libstrings.code_points(self._column), retain_index=False - ) + with acquire_spill_lock(): + plc_column = plc.strings.attributes.code_points( + self._column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result, retain_index=False) def translate(self, table: dict) -> SeriesOrIndex: """ @@ -4465,9 +4611,12 @@ def translate(self, table: dict) -> SeriesOrIndex: dtype: object """ table = str.maketrans(table) - return self._return_or_inplace( - libstrings.translate(self._column, table) - ) + with acquire_spill_lock(): + plc_result = plc.strings.translate.translate( + self._column.to_pylibcudf(mode="read"), table + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def filter_characters( self, table: dict, keep: bool = True, repl: str | None = None @@ -4516,11 +4665,17 @@ def filter_characters( if repl is None: repl = "" table = str.maketrans(table) - return self._return_or_inplace( - libstrings.filter_characters( - self._column, table, keep, cudf.Scalar(repl, "str") - ), - ) + with acquire_spill_lock(): + plc_result = plc.strings.translate.filter_characters( + self._column.to_pylibcudf(mode="read"), + table, + plc.strings.translate.FilterType.KEEP + if keep + else plc.strings.translate.FilterType.REMOVE, + cudf.Scalar(repl, "str").device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def normalize_spaces(self) -> SeriesOrIndex: r""" @@ -5614,17 +5769,12 @@ def _massage_string_arg(value, name, allow_col=False): allowed_types.append("Column") - raise ValueError( - f"Expected {_expected_types_format(allowed_types)} " - f"for {name} but got {type(value)}" - ) - - -def _expected_types_format(types): - if len(types) == 1: - return types[0] + if len(allowed_types) == 1: + expected = allowed_types[0] + else: + expected = ", ".join(allowed_types[:-1]) + ", or " + allowed_types[-1] - return ", ".join(types[:-1]) + ", or " + types[-1] + raise 
ValueError(f"Expected {expected} for {name} but got {type(value)}") class StringColumn(column.ColumnBase): @@ -5750,17 +5900,13 @@ def end_offset(self) -> int: @cached_property def memory_usage(self) -> int: - n = 0 - if self.data is not None: - n += self.data.size + n = super().memory_usage if len(self.base_children) == 1: child0_size = (self.size + 1) * self.base_children[ 0 ].dtype.itemsize n += child0_size - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) return n @property @@ -5848,11 +5994,13 @@ def sum( skipna=skipna, min_count=min_count ) if isinstance(result_col, type(self)): - return libstrings.join( - result_col, - sep=cudf.Scalar(""), - na_rep=cudf.Scalar(None, "str"), - ).element_indexing(0) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_strings( + result_col.to_pylibcudf(mode="read"), + cudf.Scalar("").device_value.c_value, + cudf.Scalar(None, "str").device_value.c_value, + ) + return Column.from_pylibcudf(plc_column).element_indexing(0) else: return result_col @@ -5901,13 +6049,12 @@ def strptime( ) is_nat = self == "NaT" without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) - all_same_length = ( - libstrings.count_characters(without_nat).distinct_count( - dropna=True + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + without_nat.to_pylibcudf(mode="read") ) - == 1 - ) - if not all_same_length: + char_counts = Column.from_pylibcudf(plc_column) + if char_counts.distinct_count(dropna=True) != 1: # Unfortunately disables OK cases like: # ["2020-01-01", "2020-01-01 00:00:00"] # But currently incorrect for cases like (drops 10): @@ -6108,14 +6255,18 @@ def _binaryop( rhs: cudf.Scalar | StringColumn lhs, rhs = (other, self) if reflect else (self, other) - return cast( - "column.ColumnBase", - libstrings.concatenate( - [lhs, rhs], - sep=cudf.Scalar(""), - na_rep=cudf.Scalar(None, "str"), - ), - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.concatenate( + plc.Table( + [ + lhs.to_pylibcudf(mode="read"), + rhs.to_pylibcudf(mode="read"), + ] + ), + cudf.Scalar("").device_value.c_value, + cudf.Scalar(None, "str").device_value.c_value, + ) + return Column.from_pylibcudf(plc_column) elif op in { "__eq__", "__ne__", @@ -6155,52 +6306,39 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) - -def _get_cols_list(parent_obj, others): - parent_index = ( - parent_obj.index if isinstance(parent_obj, cudf.Series) else parent_obj - ) - - if ( - can_convert_to_column(others) - and len(others) > 0 - and ( - can_convert_to_column( - others.iloc[0] - if isinstance(others, cudf.Series) - else others[0] - ) - ) - ): + def _modify_characters( + self, method: Callable[[plc.Column], plc.Column] + ) -> Self: """ - If others is a list-like object (in our case lists & tuples) - just another Series/Index, great go ahead with concatenation. + Helper function for methods that modify characters e.g. 
to_lower """ - cols_list = [ - column.as_column(frame.reindex(parent_index), dtype="str") - if ( - parent_index is not None - and isinstance(frame, cudf.Series) - and not frame.index.equals(parent_index) - ) - else column.as_column(frame, dtype="str") - for frame in others - ] + with acquire_spill_lock(): + plc_column = method(self.to_pylibcudf(mode="read")) + return cast(Self, Column.from_pylibcudf(plc_column)) - return cols_list - elif others is not None and not isinstance(others, StringMethods): - if ( - parent_index is not None - and isinstance(others, cudf.Series) - and not others.index.equals(parent_index) - ): - others = others.reindex(parent_index) + def to_lower(self) -> Self: + return self._modify_characters(plc.strings.case.to_lower) - return [column.as_column(others, dtype="str")] - else: - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarrary " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) + def to_upper(self) -> Self: + return self._modify_characters(plc.strings.case.to_upper) + + def capitalize(self) -> Self: + return self._modify_characters(plc.strings.capitalize.capitalize) + + def swapcase(self) -> Self: + return self._modify_characters(plc.strings.case.swapcase) + + def title(self) -> Self: + return self._modify_characters(plc.strings.capitalize.title) + + def is_title(self) -> Self: + return self._modify_characters(plc.strings.capitalize.is_title) + + def replace_multiple(self, pattern: Self, replacements: Self) -> Self: + with acquire_spill_lock(): + plc_result = plc.strings.replace.replace_multiple( + self.to_pylibcudf(mode="read"), + pattern.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + ) + return cast(Self, Column.from_pylibcudf(plc_result)) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 8f16ba4e15b..2adc6b54bab 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -101,10 +101,7 @@ def to_pandas( @cached_property def memory_usage(self) -> int: - n = 0 - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) - + n = super().memory_usage for child in self.children: n += child.memory_usage diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 73c0af45293..b58ab13be93 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -45,7 +45,7 @@ from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable -from cudf.core.buffer import acquire_spill_lock +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -3191,9 +3191,10 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): out.append(result._with_type_metadata(col.dtype)) else: - out_mask = cudf._lib.null_mask.create_null_mask( - len(source_col), - state=cudf._lib.null_mask.MaskState.ALL_NULL, + out_mask = as_buffer( + plc.null_mask.create_null_mask( + len(source_col), plc.null_mask.MaskState.ALL_NULL + ) ) out.append(source_col.set_mask(out_mask)) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 315324c130c..e977f037b79 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -19,7 +19,6 @@ import cudf from cudf import 
_lib as libcudf from cudf._lib import groupby as libgroupby -from cudf._lib.null_mask import bitmask_or from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default @@ -1118,8 +1117,7 @@ def ngroup(self, ascending=True): """ index = self.grouping.keys.unique().sort_values() num_groups = len(index) - _, has_null_group = bitmask_or([*index._columns]) - + has_null_group = any(col.has_nulls() for col in index._columns) if ascending: # Count ascending from 0 to num_groups - 1 groups = range(num_groups) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 9a22045ff78..91f23490031 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -242,12 +242,11 @@ def _convert_str_col(col, errors, _downcast=None): def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: """Handles empty and infinity strings""" - col = libstrings.to_lower(col) + col = col.to_lower() # type: ignore[attr-defined] col = col.find_and_replace(as_column([""]), as_column(["NaN"])) # TODO: This can be handled by libcudf in # future see StringColumn.as_numerical_column - col = libstrings.replace_multi( - col, + col = col.replace_multiple( # type: ignore[attr-defined] as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]), ) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index a5dc8a5498c..6624a1a150e 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -15,8 +15,9 @@ from numba.cuda.cudadecl import registry as cuda_decl_registry from numba.cuda.cudaimpl import lower as cuda_lower +import pylibcudf as plc + import cudf -from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string from cudf.core.udf.strings_typing import StringView, string_view, udf_string @@ -91,7 +92,7 @@ def random_bitmask(size): size : int number of bits """ - sz = bitmask_allocation_size_bytes(size) + sz = plc.null_mask.bitmask_allocation_size_bytes(size) rng = np.random.default_rng(seed=0) data = rng.integers(0, 255, dtype="u1", size=sz) return data.view("i1") diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 47976fc4bac..b48be6b2c2f 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -277,6 +277,12 @@ def test_cudf_json_writer_read(gdf_writer_types): """{"a":{"L": [{"M": null}, {}]}, "b":1.1}\n""", """{"a":{"L": [{}, {}]}, "b":1.1}\n""", ), + # empty structs + ("""{"A": null}\n {"A": {}}\n {}""", """{}\n{"A":{}}\n{}\n"""), + ( + """{"A": {"B": null}}\n {"A": {"B": {}}}\n {"A": {}}""", + """{"A":{}}\n{"A":{"B":{}}}\n{"A":{}}\n""", + ), ], ) def test_cudf_json_roundtrip(jsonl_string, expected): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index e25f99d7bee..9700f548a16 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1272,7 +1272,7 @@ def test_string_slice_from(): gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) d_starts = cudf.Series([2, 3, 0, -1, -1], dtype=np.int32) d_stops = cudf.Series([-1, -1, 0, -1, -1], dtype=np.int32) - got = gs.str.slice_from(starts=d_starts._column, stops=d_stops._column) + got = gs.str.slice_from(starts=d_starts, stops=d_stops) expected = 
cudf.Series(["llo world", "y accéntéd", "", None, ""]) assert_eq(got, expected) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 294253cd119..e6d252b8807 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd +import pylibcudf as plc import rmm import cudf @@ -252,7 +253,7 @@ def pa_mask_buffer_to_mask(mask_buf, size): """ Convert PyArrow mask buffer to cuDF mask buffer """ - mask_size = cudf._lib.null_mask.bitmask_allocation_size_bytes(size) + mask_size = plc.null_mask.bitmask_allocation_size_bytes(size) if mask_buf.size < mask_size: dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_host(np.asarray(mask_buf).view("u1")) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 6617b71be81..e8d9691f2a0 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -476,23 +476,28 @@ def do_evaluate( with path.open() as f: while f.readline() == "\n": skiprows += 1 - tbl_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([path]), - delimiter=sep, - quotechar=quote, - lineterminator=eol, - col_names=column_names, - header=header, - usecols=usecols, - na_filter=True, - na_values=null_values, - keep_default_na=False, - skiprows=skiprows, - comment=comment, - decimal=decimal, - dtypes=schema, - nrows=n_rows, + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path])) + .nrows(n_rows) + .skiprows(skiprows) + .lineterminator(str(eol)) + .quotechar(str(quote)) + .decimal(decimal) + .keep_default_na(keep_default_na=False) + .na_filter(na_filter=True) + .build() ) + options.set_delimiter(str(sep)) + if column_names is not None: + options.set_names([str(name) for name in column_names]) + options.set_header(header) + options.set_dtypes(schema) + if usecols is not None: + options.set_use_cols_names([str(name) for name in usecols]) + options.set_na_values(null_values) + if comment is not None: + options.set_comment(comment) + tbl_w_meta = plc.io.csv.read_csv(options) pieces.append(tbl_w_meta) if read_partial: n_rows -= tbl_w_meta.tbl.num_rows() diff --git a/python/pylibcudf/pylibcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/io/csv.pxd index f04edaa316a..95f3ff4fe45 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/io/csv.pxd @@ -6,11 +6,63 @@ from libcpp cimport bool from pylibcudf.libcudf.io.csv cimport ( csv_writer_options, csv_writer_options_builder, + csv_reader_options, + csv_reader_options_builder, ) -from pylibcudf.libcudf.io.types cimport quote_style -from pylibcudf.io.types cimport SinkInfo +from pylibcudf.io.types cimport SinkInfo, SourceInfo, TableWithMetadata from pylibcudf.table cimport Table +from pylibcudf.libcudf.io.types cimport ( + compression_type, + quote_style, + table_with_metadata, +) +from pylibcudf.libcudf.types cimport size_type + +cdef class CsvReaderOptions: + cdef csv_reader_options c_obj + cdef SourceInfo source + cpdef void set_header(self, size_type header) + cpdef void set_names(self, list col_names) + cpdef void set_prefix(self, str prefix) + cpdef void set_use_cols_indexes(self, list col_indices) + cpdef void set_use_cols_names(self, list col_names) + cpdef void set_delimiter(self, str delimiter) + cpdef void set_thousands(self, str thousands) + cpdef void set_comment(self, str comment) + cpdef void set_parse_dates(self, list val) + cpdef void set_parse_hex(self, list val) + cpdef void set_dtypes(self, object 
types) + cpdef void set_true_values(self, list true_values) + cpdef void set_false_values(self, list false_values) + cpdef void set_na_values(self, list na_values) + + +cdef class CsvReaderOptionsBuilder: + cdef csv_reader_options_builder c_obj + cdef SourceInfo source + cpdef CsvReaderOptionsBuilder compression(self, compression_type compression) + cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols) + cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset) + cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size) + cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows) + cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows) + cpdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter) + cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting) + cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator) + cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar) + cpdef CsvReaderOptionsBuilder decimal(self, str decimal) + cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace) + cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace) + cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines) + cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote) + cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na) + cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter) + cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst) + cpdef CsvReaderOptions build(self) + +cpdef TableWithMetadata read_csv(CsvReaderOptions options) + cdef class CsvWriterOptions: cdef csv_writer_options c_obj cdef Table table diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi index 583b66bc29c..540cbc778ea 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyi +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -2,6 +2,8 @@ from collections.abc import Mapping +from typing_extensions import Self + from pylibcudf.io.types import ( CompressionType, QuoteStyle, @@ -12,6 +14,47 @@ from pylibcudf.io.types import ( from pylibcudf.table import Table from pylibcudf.types import DataType +class CsvReaderOptions: + def __init__(self): ... + def set_header(self, header: int): ... + def set_names(self, col_names: list[str]): ... + def set_prefix(self, prefix: str): ... + def set_use_cols_indexes(self, col_indices: list[int]): ... + def set_use_cols_names(self, col_names: list[str]): ... + def set_delimiter(self, delimiter: str): ... + def set_thousands(self, thousands: str): ... + def set_comment(self, comment: str): ... + def set_parse_dates(self, val: list[int | str]): ... + def set_parse_hex(self, val: list[int | str]): ... + def set_dtypes(self, types: dict[str, DataType] | list[DataType]): ... + def set_true_values(self, true_values: list[str]): ... + def set_false_values(self, false_values: list[str]): ... + def set_na_values(self, na_values: list[str]): ... + @staticmethod + def builder(source: SourceInfo) -> CsvReaderOptionsBuilder: ... + +class CsvReaderOptionsBuilder: + def __init__(self): ... + def compression(self, compression: CompressionType) -> Self: ... + def mangle_dupe_cols(self, mangle_dupe_cols: bool) -> Self: ... + def byte_range_offset(self, byte_range_offset: int) -> Self: ... + def byte_range_size(self, byte_range_size: int) -> Self: ... + def nrows(self, nrows: int) -> Self: ... + def skiprows(self, skiprows: int) -> Self: ... 
+ def skipfooter(self, skipfooter: int) -> Self: ... + def quoting(self, quoting: QuoteStyle) -> Self: ... + def lineterminator(self, lineterminator: str) -> Self: ... + def quotechar(self, quotechar: str) -> Self: ... + def decimal(self, decimal: str) -> Self: ... + def delim_whitespace(self, delim_whitespace: bool) -> Self: ... + def skipinitialspace(self, skipinitialspace: bool) -> Self: ... + def skip_blank_lines(self, skip_blank_lines: bool) -> Self: ... + def doublequote(self, doublequote: bool) -> Self: ... + def keep_default_na(self, keep_default_na: bool) -> Self: ... + def na_filter(self, na_filter: bool) -> Self: ... + def dayfirst(self, dayfirst: bool) -> Self: ... + def build(self) -> CsvReaderOptions: ... + def read_csv( source_info: SourceInfo, *, @@ -54,7 +97,7 @@ def read_csv( # detect_whitespace_around_quotes: bool = False, # timestamp_type: DataType = DataType(type_id.EMPTY), ) -> TableWithMetadata: ... -def write_csv(options: CsvWriterOptionsBuilder) -> None: ... +def write_csv(options: CsvWriterOptionsBuilder): ... class CsvWriterOptions: def __init__(self): ... @@ -63,14 +106,12 @@ class CsvWriterOptions: class CsvWriterOptionsBuilder: def __init__(self): ... - def names(self, names: list) -> CsvWriterOptionsBuilder: ... - def na_rep(self, val: str) -> CsvWriterOptionsBuilder: ... - def include_header(self, val: bool) -> CsvWriterOptionsBuilder: ... - def rows_per_chunk(self, val: int) -> CsvWriterOptionsBuilder: ... - def line_terminator(self, term: str) -> CsvWriterOptionsBuilder: ... - def inter_column_delimiter( - self, delim: str - ) -> CsvWriterOptionsBuilder: ... - def true_value(self, val: str) -> CsvWriterOptionsBuilder: ... - def false_value(self, val: str) -> CsvWriterOptionsBuilder: ... + def names(self, names: list) -> Self: ... + def na_rep(self, val: str) -> Self: ... + def include_header(self, val: bool) -> Self: ... + def rows_per_chunk(self, val: int) -> Self: ... + def line_terminator(self, term: str) -> Self: ... + def inter_column_delimiter(self, delim: str) -> Self: ... + def true_value(self, val: str) -> Self: ... + def false_value(self, val: str) -> Self: ... def build(self) -> CsvWriterOptions: ... diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 8be391de2c2..efc9bb813a1 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -28,252 +28,628 @@ __all__ = [ "write_csv", "CsvWriterOptions", "CsvWriterOptionsBuilder", + "CsvReaderOptions", + "CsvReaderOptionsBuilder", ] -cdef tuple _process_parse_dates_hex(list cols): - cdef vector[string] str_cols - cdef vector[int] int_cols - for col in cols: - if isinstance(col, str): - str_cols.push_back(col.encode()) +cdef class CsvReaderOptions: + """The settings to use for ``read_csv`` + For details, see :cpp:class:`cudf::io::csv_reader_options` + """ + @staticmethod + def builder(SourceInfo source): + """ + Create a CsvWriterOptionsBuilder object + + For details, see :cpp:func:`cudf::io::csv_reader_options::builder` + + Parameters + ---------- + sink : SourceInfo + The source to read the CSV file from. + + Returns + ------- + CsvReaderOptionsBuilder + Builder to build CsvReaderOptions + """ + cdef CsvReaderOptionsBuilder csv_builder = CsvReaderOptionsBuilder.__new__( + CsvReaderOptionsBuilder + ) + csv_builder.c_obj = csv_reader_options.builder(source.c_obj) + csv_builder.source = source + return csv_builder + + cpdef void set_header(self, size_type header): + """ + Sets header row index. 
+
+        Parameters
+        ----------
+        header : size_type
+            Index where header row is located
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_header(header)
+
+    cpdef void set_names(self, list col_names):
+        """
+        Sets names of the columns.
+
+        Parameters
+        ----------
+        col_names : list[str]
+            List of column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        for name in col_names:
+            vec.push_back(name.encode())
+        self.c_obj.set_names(vec)
+
+    cpdef void set_prefix(self, str prefix):
+        """
+        Sets prefix to be used for column ID.
+
+        Parameters
+        ----------
+        prefix : str
+            String used as a prefix for each column name
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_prefix(prefix.encode())
+
+    cpdef void set_use_cols_indexes(self, list col_indices):
+        """
+        Sets indexes of columns to read.
+
+        Parameters
+        ----------
+        col_indices : list[int]
+            List of column indices that are needed
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[int] vec
+        for i in col_indices:
+            vec.push_back(i)
+        self.c_obj.set_use_cols_indexes(vec)
+
+    cpdef void set_use_cols_names(self, list col_names):
+        """
+        Sets names of the columns to be read.
+
+        Parameters
+        ----------
+        col_names : list[str]
+            List of column names that are needed
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        for name in col_names:
+            vec.push_back(name.encode())
+        self.c_obj.set_use_cols_names(vec)
+
+    cpdef void set_delimiter(self, str delimiter):
+        """
+        Sets field delimiter.
+
+        Parameters
+        ----------
+        delimiter : str
+            A character to indicate delimiter
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_delimiter(ord(delimiter))
+
+    cpdef void set_thousands(self, str thousands):
+        """
+        Sets numeric data thousands separator.
+
+        Parameters
+        ----------
+        thousands : str
+            A character that separates thousands
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_thousands(ord(thousands))
+
+    cpdef void set_comment(self, str comment):
+        """
+        Sets comment line start character.
+
+        Parameters
+        ----------
+        comment : str
+            A character that indicates comment
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_comment(ord(comment))
+
+    cpdef void set_parse_dates(self, list val):
+        """
+        Sets indexes or names of columns to read as datetime.
+
+        Parameters
+        ----------
+        val : list[int | str]
+            List of column indices or names to parse as datetime.
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec_str
+        cdef vector[int] vec_int
+        if not all([isinstance(col, (str, int)) for col in val]):
+            raise TypeError("Must be a list of int or str")
+        else:
+            for date in val:
+                if isinstance(date, str):
+                    vec_str.push_back(date.encode())
+                else:
+                    vec_int.push_back(date)
+            self.c_obj.set_parse_dates(vec_str)
+            self.c_obj.set_parse_dates(vec_int)
+
+    cpdef void set_parse_hex(self, list val):
+        """
+        Sets indexes or names of columns to parse as hexadecimal.
+
+        Parameters
+        ----------
+        val : list[int | str]
+            List of column indices or names to parse as hexadecimal
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec_str
+        cdef vector[int] vec_int
+        if not all([isinstance(col, (str, int)) for col in val]):
+            raise TypeError("Must be a list of int or str")
         else:
-            int_cols.push_back(col)
-    return str_cols, int_cols
-
-cdef vector[string] _make_str_vector(list vals):
-    cdef vector[string] res
-    for val in vals:
-        res.push_back((val).encode())
-    return res
-
-
-def read_csv(
-    SourceInfo source_info,
-    *,
-    compression_type compression = compression_type.AUTO,
-    size_t byte_range_offset = 0,
-    size_t byte_range_size = 0,
-    list col_names = None,
-    str prefix = "",
-    bool mangle_dupe_cols = True,
-    list usecols = None,
-    size_type nrows = -1,
-    size_type skiprows = 0,
-    size_type skipfooter = 0,
-    size_type header = 0,
-    str lineterminator = "\n",
-    str delimiter = None,
-    str thousands = None,
-    str decimal = ".",
-    str comment = None,
-    bool delim_whitespace = False,
-    bool skipinitialspace = False,
-    bool skip_blank_lines = True,
-    quote_style quoting = quote_style.MINIMAL,
-    str quotechar = '"',
-    bool doublequote = True,
-    list parse_dates = None,
-    list parse_hex = None,
-    # Technically this should be dict/list
-    # but using a fused type prevents using None as default
-    object dtypes = None,
-    list true_values = None,
-    list false_values = None,
-    list na_values = None,
-    bool keep_default_na = True,
-    bool na_filter = True,
-    bool dayfirst = False,
-    # Note: These options are supported by the libcudf reader
-    # but are not exposed here since there is no demand for them
-    # on the Python side yet.
-    # bool detect_whitespace_around_quotes = False,
-    # DataType timestamp_type = DataType(type_id.EMPTY),
+            for hx in val:
+                if isinstance(hx, str):
+                    vec_str.push_back(hx.encode())
+                else:
+                    vec_int.push_back(hx)
+
+        self.c_obj.set_parse_hex(vec_str)
+        self.c_obj.set_parse_hex(vec_int)
+
+    cpdef void set_dtypes(self, object types):
+        """
+        Sets per-column types.
+
+        Parameters
+        ----------
+        types : dict[str, data_type] | list[data_type]
+            Map of column names to target data types, or a list of
+            per-column target data types.
+
+        Returns
+        -------
+        None
+        """
+        cdef map[string, data_type] dtype_map
+        cdef vector[data_type] dtype_list
+        if isinstance(types, dict):
+            for name, dtype in types.items():
+                dtype_map[str(name).encode()] = (<DataType>dtype).c_obj
+            self.c_obj.set_dtypes(dtype_map)
+        elif isinstance(types, list):
+            for dtype in types:
+                dtype_list.push_back((<DataType>dtype).c_obj)
+            self.c_obj.set_dtypes(dtype_list)
+        else:
+            raise TypeError("Must pass a dict or list")
+
+    cpdef void set_true_values(self, list true_values):
+        """
+        Sets additional values to recognize as boolean true values.
+
+        Parameters
+        ----------
+        true_values : list[str]
+            List of values to be considered true
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        for val in true_values:
+            vec.push_back(val.encode())
+        self.c_obj.set_true_values(vec)
+
+    cpdef void set_false_values(self, list false_values):
+        """
+        Sets additional values to recognize as boolean false values.
+
+        Parameters
+        ----------
+        false_values : list[str]
+            List of values to be considered false
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        for val in false_values:
+            vec.push_back(val.encode())
+        self.c_obj.set_false_values(vec)
+
+    cpdef void set_na_values(self, list na_values):
+        """
+        Sets additional values to recognize as null values.
+
+        Parameters
+        ----------
+        na_values : list[str]
+            List of values to be considered null
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        for val in na_values:
+            vec.push_back(val.encode())
+        self.c_obj.set_na_values(vec)
+
+
+cdef class CsvReaderOptionsBuilder:
+    """
+    Builder to build options for ``read_csv``
+
+    For details, see :cpp:class:`cudf::io::csv_reader_options_builder`
+    """
+    cpdef CsvReaderOptionsBuilder compression(self, compression_type compression):
+        """
+        Sets compression format of the source.
+
+        Parameters
+        ----------
+        compression : compression_type
+            Compression type
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.compression(compression)
+        return self
+
+    cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols):
+        """
+        Sets whether to rename duplicate column names.
+
+        Parameters
+        ----------
+        mangle_dupe_cols : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.mangle_dupe_cols(mangle_dupe_cols)
+        return self
+
+    cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset):
+        """
+        Sets number of bytes to skip from source start.
+
+        Parameters
+        ----------
+        byte_range_offset : size_t
+            Number of bytes of offset
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.byte_range_offset(byte_range_offset)
+        return self
+
+    cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size):
+        """
+        Sets number of bytes to read.
+
+        Parameters
+        ----------
+        byte_range_size : size_t
+            Number of bytes to read
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.byte_range_size(byte_range_size)
+        return self
+
+    cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows):
+        """
+        Sets number of rows to read.
+
+        Parameters
+        ----------
+        nrows : size_type
+            Number of rows to read
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.nrows(nrows)
+        return self
+
+    cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows):
+        """
+        Sets number of rows to skip from start.
+
+        Parameters
+        ----------
+        skiprows : size_type
+            Number of rows to skip
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.skiprows(skiprows)
+        return self
+
+    cpdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter):
+        """
+        Sets number of rows to skip from end.
+
+        Parameters
+        ----------
+        skipfooter : size_type
+            Number of rows to skip
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.skipfooter(skipfooter)
+        return self
+
+    cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting):
+        """
+        Sets quoting style.
+
+        Parameters
+        ----------
+        quoting : quote_style
+            Quoting style used
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.quoting(quoting)
+        return self
+
+    cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator):
+        """
+        Sets line terminator.
+
+        Parameters
+        ----------
+        lineterminator : str
+            A character to indicate line termination
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.lineterminator(ord(lineterminator))
+        return self
+
+    cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar):
+        """
+        Sets quoting character.
+
+        Parameters
+        ----------
+        quotechar : str
+            A character to indicate quoting
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.quotechar(ord(quotechar))
+        return self
+
+    cpdef CsvReaderOptionsBuilder decimal(self, str decimal):
+        """
+        Sets decimal point character.
+
+        Parameters
+        ----------
+        decimal : str
+            A character that indicates decimal values
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.decimal(ord(decimal))
+        return self
+
+    cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace):
+        """
+        Sets whether to treat whitespace as field delimiter.
+
+        Parameters
+        ----------
+        delim_whitespace : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.delim_whitespace(delim_whitespace)
+        return self
+
+    cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace):
+        """
+        Sets whether to skip whitespace after the delimiter.
+
+        Parameters
+        ----------
+        skipinitialspace : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.skipinitialspace(skipinitialspace)
+        return self
+
+    cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines):
+        """
+        Sets whether to ignore empty lines (otherwise parse line values as invalid).
+
+        Parameters
+        ----------
+        skip_blank_lines : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.skip_blank_lines(skip_blank_lines)
+        return self
+
+    cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote):
+        """
+        Sets whether a quote inside a value is double-quoted.
+
+        Parameters
+        ----------
+        doublequote : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.doublequote(doublequote)
+        return self
+
+    cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na):
+        """
+        Sets whether to keep the built-in default NA values.
+
+        Parameters
+        ----------
+        keep_default_na : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.keep_default_na(keep_default_na)
+        return self
+
+    cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter):
+        """
+        Sets whether to detect missing values; disable to skip the null filter.
+
+        Parameters
+        ----------
+        na_filter : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.na_filter(na_filter)
+        return self
+
+    cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst):
+        """
+        Sets whether to parse dates as DD/MM versus MM/DD.
+
+        Parameters
+        ----------
+        dayfirst : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.dayfirst(dayfirst)
+        return self
+
+    cpdef CsvReaderOptions build(self):
+        """Create a CsvReaderOptions object"""
+        cdef CsvReaderOptions csv_options = CsvReaderOptions.__new__(
+            CsvReaderOptions
+        )
+        csv_options.c_obj = move(self.c_obj.build())
+        csv_options.source = self.source
+        return csv_options
+
+
+cpdef TableWithMetadata read_csv(
+    CsvReaderOptions options
 ):
-    """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`.
+    """
+    Read from CSV format.
+ + The source to read from and options are encapsulated + by the `options` object. For details, see :cpp:func:`read_csv`. Parameters ---------- - source_info : SourceInfo - The SourceInfo to read the CSV file from. - compression : compression_type, default CompressionType.AUTO - The compression format of the CSV source. - byte_range_offset : size_type, default 0 - Number of bytes to skip from source start. - byte_range_size : size_type, default 0 - Number of bytes to read. By default, will read all bytes. - col_names : list, default None - The column names to use. - prefix : string, default '' - The prefix to apply to the column names. - mangle_dupe_cols : bool, default True - If True, rename duplicate column names. - usecols : list, default None - Specify the string column names/integer column indices of columns to be read. - nrows : size_type, default -1 - The number of rows to read. - skiprows : size_type, default 0 - The number of rows to skip from the start before reading - skipfooter : size_type, default 0 - The number of rows to skip from the end - header : size_type, default 0 - The index of the row that will be used for header names. - Pass -1 to use default column names. - lineterminator : str, default '\\n' - The character used to determine the end of a line. - delimiter : str, default "," - The character used to separate fields in a row. - thousands : str, default None - The character used as the thousands separator. - Cannot match delimiter. - decimal : str, default '.' - The character used as the decimal separator. - Cannot match delimiter. - comment : str, default None - The character used to identify the start of a comment line. - (which will be skipped by the reader) - delim_whitespace : bool, default False - If True, treat whitespace as the field delimiter. - skipinitialspace : bool, default False - If True, skip whitespace after the delimiter. - skip_blank_lines : bool, default True - If True, ignore empty lines (otherwise line values are parsed as null). - quoting : QuoteStyle, default QuoteStyle.MINIMAL - The quoting style used in the input CSV data. One of - { QuoteStyle.MINIMAL, QuoteStyle.ALL, QuoteStyle.NONNUMERIC, QuoteStyle.NONE } - quotechar : str, default '"' - The character used to indicate quoting. - doublequote : bool, default True - If True, a quote inside a value is double-quoted. - parse_dates : list, default None - A list of integer column indices/string column names - of columns to read as datetime. - parse_hex : list, default None - A list of integer column indices/string column names - of columns to read as hexadecimal. - dtypes : Union[Dict[str, DataType], List[DataType]], default None - A list of data types or a dictionary mapping column names - to a DataType. - true_values : List[str], default None - A list of additional values to recognize as True. - false_values : List[str], default None - A list of additional values to recognize as False. - na_values : List[str], default None - A list of additional values to recognize as null. - keep_default_na : bool, default True - Whether to keep the built-in default N/A values. - na_filter : bool, default True - Whether to detect missing values. If False, can - improve performance. - dayfirst : bool, default False - If True, interpret dates as being in the DD/MM format. - - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. 
+ options: CsvReaderOptions + Settings for controlling reading behavior """ - cdef vector[string] c_parse_dates_names - cdef vector[int] c_parse_dates_indexes - cdef vector[int] c_parse_hex_names - cdef vector[int] c_parse_hex_indexes - cdef vector[data_type] c_dtypes_list - cdef map[string, data_type] c_dtypes_map - - cdef csv_reader_options options = ( - csv_reader_options.builder(source_info.c_obj) - .compression(compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(byte_range_offset) - .byte_range_size(byte_range_size) - .nrows(nrows) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(quoting) - .lineterminator(ord(lineterminator)) - .quotechar(ord(quotechar)) - .decimal(ord(decimal)) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - options.set_header(header) - - if col_names is not None: - options.set_names([str(name).encode() for name in col_names]) - - if prefix is not None: - options.set_prefix(prefix.encode()) - - if usecols is not None: - if all([isinstance(col, int) for col in usecols]): - options.set_use_cols_indexes(list(usecols)) - else: - options.set_use_cols_names([str(name).encode() for name in usecols]) - - if delimiter is not None: - options.set_delimiter(ord(delimiter)) - - if thousands is not None: - options.set_thousands(ord(thousands)) - - if comment is not None: - options.set_comment(ord(comment)) - - if parse_dates is not None: - if not all([isinstance(col, (str, int)) for col in parse_dates]): - raise NotImplementedError( - "`parse_dates`: Must pass a list of column names/indices") - - # Set both since users are allowed to mix column names and indices - c_parse_dates_names, c_parse_dates_indexes = \ - _process_parse_dates_hex(parse_dates) - options.set_parse_dates(c_parse_dates_names) - options.set_parse_dates(c_parse_dates_indexes) - - if parse_hex is not None: - if not all([isinstance(col, (str, int)) for col in parse_hex]): - raise NotImplementedError( - "`parse_hex`: Must pass a list of column names/indices") - - # Set both since users are allowed to mix column names and indices - c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex) - options.set_parse_hex(c_parse_hex_names) - options.set_parse_hex(c_parse_hex_indexes) - - if isinstance(dtypes, list): - for dtype in dtypes: - c_dtypes_list.push_back((dtype).c_obj) - options.set_dtypes(c_dtypes_list) - elif isinstance(dtypes, dict): - # dtypes_t is dict - for k, v in dtypes.items(): - c_dtypes_map[str(k).encode()] = (v).c_obj - options.set_dtypes(c_dtypes_map) - elif dtypes is not None: - raise TypeError("dtypes must either by a list/dict") - - if true_values is not None: - options.set_true_values(_make_str_vector(true_values)) - - if false_values is not None: - options.set_false_values(_make_str_vector(false_values)) - - if na_values is not None: - options.set_na_values(_make_str_vector(na_values)) - cdef table_with_metadata c_result with nogil: - c_result = move(cpp_read_csv(options)) + c_result = move(cpp_read_csv(options.c_obj)) - return TableWithMetadata.from_libcudf(c_result) + cdef TableWithMetadata tbl_meta = TableWithMetadata.from_libcudf(c_result) + return tbl_meta # TODO: Implement the remaining methods diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index 90d2d0896a5..1cbaac57315 100644 --- 
a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -77,14 +77,16 @@ def test_read_csv_basic( offset=skiprows, length=nrows if nrows != -1 else None ) - res = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), - delimiter=delimiter, - compression=compression_type, - col_names=column_names, - nrows=nrows, - skiprows=skiprows, + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .compression(compression_type) + .nrows(nrows) + .skiprows(skiprows) + .build() ) + options.set_delimiter(delimiter) + options.set_names([str(name) for name in column_names]) + res = plc.io.csv.read_csv(options) assert_table_and_meta_eq( pa_table, @@ -110,15 +112,15 @@ def test_read_csv_byte_range(table_data, chunk_size, tmp_path): file_size = os.stat(source).st_size tbls_w_meta = [] for segment in range((file_size + chunk_size - 1) // chunk_size): - tbls_w_meta.append( - plc.io.csv.read_csv( - plc.io.SourceInfo([source]), - byte_range_offset=segment * chunk_size, - byte_range_size=chunk_size, - header=-1, - col_names=pa_table.column_names, - ) + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .byte_range_offset(segment * chunk_size) + .byte_range_size(chunk_size) + .build() ) + options.set_header(-1) + options.set_names([str(name) for name in pa_table.column_names]) + tbls_w_meta.append(plc.io.csv.read_csv(options)) if isinstance(source, io.IOBase): source.seek(0) exp = pd.read_csv(source, names=pa_table.column_names, header=None) @@ -161,9 +163,16 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): new_schema = pa.schema(new_fields) - res = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), dtypes=dtypes, usecols=usecols - ) + options = plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + options.set_dtypes(dtypes) + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + res = plc.io.csv.read_csv(options) new_table = pa_table.cast(new_schema) assert_table_and_meta_eq(new_table, res) @@ -171,7 +180,7 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): @pytest.mark.parametrize("skip_blanks", [True, False]) @pytest.mark.parametrize("decimal, quotechar", [(".", "'"), ("_", '"')]) -@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) +@pytest.mark.parametrize("lineterminator", ["\n", "\t"]) def test_read_csv_parse_options( source_or_sink, decimal, quotechar, skip_blanks, lineterminator ): @@ -188,19 +197,25 @@ def test_read_csv_parse_options( write_source_str(source_or_sink, buffer) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source_or_sink]), - comment="#", - decimal=decimal, - skip_blank_lines=skip_blanks, - quotechar=quotechar, + options = ( + plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source_or_sink]) + ) + .lineterminator(lineterminator) + .quotechar(quotechar) + .decimal(decimal) + .skip_blank_lines(skip_blanks) + .build() ) + options.set_comment("#") + plc_table_w_meta = plc.io.csv.read_csv(options) df = pd.read_csv( StringIO(buffer), comment="#", decimal=decimal, skip_blank_lines=skip_blanks, quotechar=quotechar, + lineterminator=lineterminator, ) assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) @@ -216,12 +231,17 @@ def test_read_csv_na_values( write_source_str(source_or_sink, buffer) - plc_table_w_meta = plc.io.csv.read_csv( - 
plc.io.SourceInfo([source_or_sink]), - na_filter=na_filter, - na_values=na_values if na_filter else None, - keep_default_na=keep_default_na, + options = ( + plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source_or_sink]) + ) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .build() ) + if na_filter and na_values is not None: + options.set_na_values(na_values) + plc_table_w_meta = plc.io.csv.read_csv(options) df = pd.read_csv( StringIO(buffer), na_filter=na_filter, @@ -241,9 +261,11 @@ def test_read_csv_header(csv_table_data, source_or_sink, header): **_COMMON_CSV_SOURCE_KWARGS, ) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), header=header - ) + options = plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + options.set_header(header) + plc_table_w_meta = plc.io.csv.read_csv(options) if header > 0: if header < len(pa_table): names_row = pa_table.take([header - 1]).to_pylist()[0].values()
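For reference, here is a minimal usage sketch (not part of the diff) of the reader options flow this PR introduces, mirroring the pattern used in the updated tests. The file name, delimiter, and NA values below are illustrative placeholders.

import pylibcudf as plc

# Chain the builder-backed settings, then freeze them with .build().
# "data.csv" is a hypothetical path; any SourceInfo-compatible source works.
options = (
    plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo(["data.csv"]))
    .nrows(100)
    .skiprows(1)
    .build()
)

# List- and string-valued settings are applied on the built options object.
options.set_delimiter("|")
options.set_na_values(["NA", "null"])

# read_csv now takes the options object and returns a TableWithMetadata.
tbl_w_meta = plc.io.csv.read_csv(options)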