diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 914731ea417..f3e1a61d075 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -288,7 +288,7 @@ struct host_span : public cudf::detail::span_base, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept - : base(other.data(), other.size()) + : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()} { } diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 312a5243687..d276e946a51 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -309,10 +309,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - cuda::std::optional& val; // union structs are always wrapped in std::optional + std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, cuda::std::optional& v) + parquet_field_union_struct(int f, E& ev, std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } @@ -439,10 +439,10 @@ class parquet_field_struct_blob : public parquet_field { */ template class parquet_field_optional : public parquet_field { - cuda::std::optional& val; + std::optional& val; public: - parquet_field_optional(int f, cuda::std::optional& v) : parquet_field(f), val(v) {} + parquet_field_optional(int f, std::optional& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 7c985643887..2851ef67a65 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,8 +20,6 @@ #include -#include - #include #include #include @@ -94,10 +92,10 @@ struct LogicalType { BSON }; Type type; - cuda::std::optional decimal_type; - cuda::std::optional time_type; - cuda::std::optional timestamp_type; - cuda::std::optional int_type; + std::optional decimal_type; + std::optional time_type; + std::optional timestamp_type; + std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -178,21 +176,21 @@ struct SchemaElement { // 5: nested fields int32_t num_children = 0; // 6: DEPRECATED: record the original type before conversion to parquet type - cuda::std::optional converted_type; + std::optional converted_type; // 7: DEPRECATED: record the scale for DECIMAL converted type int32_t decimal_scale = 0; // 8: DEPRECATED: record the precision for DECIMAL converted type int32_t decimal_precision = 0; // 9: save field_id from original schema - cuda::std::optional field_id; + std::optional field_id; // 10: replaces converted type - cuda::std::optional logical_type; + std::optional logical_type; // extra cudf specific fields bool output_as_byte_array = false; // cudf type determined from arrow:schema - cuda::std::optional arrow_type; + std::optional arrow_type; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -258,21 +256,21 @@ struct SchemaElement { */ struct Statistics { // deprecated max value in signed comparison order - cuda::std::optional> max; + std::optional> max; // deprecated min value in signed comparison order - cuda::std::optional> min; + std::optional> min; // count of null values in the column - cuda::std::optional null_count; + std::optional null_count; // count of distinct values occurring - cuda::std::optional distinct_count; + std::optional distinct_count; // max value for column determined by ColumnOrder - cuda::std::optional> max_value; + std::optional> max_value; // min value for column determined by ColumnOrder - cuda::std::optional> min_value; + std::optional> min_value; // If true, max_value is the actual maximum value for a column - cuda::std::optional is_max_value_exact; + std::optional is_max_value_exact; // If true, min_value is the actual minimum value for a column - cuda::std::optional is_min_value_exact; + std::optional is_min_value_exact; }; /** @@ -281,7 +279,7 @@ struct Statistics { struct SizeStatistics { // Number of variable-width bytes stored for the page/chunk. Should not be set for anything // but the BYTE_ARRAY physical type. - cuda::std::optional unencoded_byte_array_data_bytes; + std::optional unencoded_byte_array_data_bytes; /** * When present, there is expected to be one element corresponding to each * repetition (i.e. size=max repetition_level+1) where each element @@ -290,14 +288,14 @@ struct SizeStatistics { * * This value should not be written if max_repetition_level is 0. */ - cuda::std::optional> repetition_level_histogram; + std::optional> repetition_level_histogram; /** * Same as repetition_level_histogram except for definition levels. * * This value should not be written if max_definition_level is 0 or 1. */ - cuda::std::optional> definition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -318,7 +316,7 @@ struct OffsetIndex { std::vector page_locations; // per-page size info. see description of the same field in SizeStatistics. only present for // columns with a BYTE_ARRAY physical type. - cuda::std::optional> unencoded_byte_array_data_bytes; + std::optional> unencoded_byte_array_data_bytes; }; /** @@ -329,11 +327,11 @@ struct ColumnIndex { std::vector> min_values; // lower bound for values in each page std::vector> max_values; // upper bound for values in each page BoundaryOrder boundary_order = - BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - cuda::std::optional> null_counts; // Optional count of null values per page + BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered + std::optional> null_counts; // Optional count of null values per page // Repetition/definition level histograms for the column chunk - cuda::std::optional> repetition_level_histogram; - cuda::std::optional> definition_level_histogram; + std::optional> repetition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -383,11 +381,11 @@ struct ColumnChunkMetaData { Statistics statistics; // Set of all encodings used for pages in this column chunk. This information can be used to // determine if all data pages are dictionary encoded for example. - cuda::std::optional> encoding_stats; + std::optional> encoding_stats; // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. - cuda::std::optional size_statistics; + std::optional size_statistics; }; /** @@ -429,13 +427,13 @@ struct RowGroup { int64_t num_rows = 0; // If set, specifies a sort ordering of the rows in this RowGroup. // The sorting columns can be a subset of all the columns. - cuda::std::optional> sorting_columns; + std::optional> sorting_columns; // Byte offset from beginning of file to first page (data or dictionary) in this row group - cuda::std::optional file_offset; + std::optional file_offset; // Total byte size of all compressed (and potentially encrypted) column data in this row group - cuda::std::optional total_compressed_size; + std::optional total_compressed_size; // Row group ordinal in the file - cuda::std::optional ordinal; + std::optional ordinal; }; /** @@ -460,7 +458,7 @@ struct FileMetaData { std::vector row_groups; std::vector key_value_metadata; std::string created_by = ""; - cuda::std::optional> column_orders; + std::optional> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a8ba3a969ce..4f6d41a97da 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -395,7 +395,7 @@ struct ColumnChunkDesc { uint8_t def_level_bits_, uint8_t rep_level_bits_, Compression codec_, - cuda::std::optional logical_type_, + std::optional logical_type_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, @@ -441,12 +441,12 @@ struct ColumnChunkDesc { int32_t num_data_pages{}; // number of data pages int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - Compression codec{}; // compressed codec enum - cuda::std::optional logical_type{}; // logical type + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + Compression codec{}; // compressed codec enum + std::optional logical_type{}; // logical type int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index b90ca36c8c7..f0a0bc0b51b 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -152,7 +152,7 @@ struct stats_caster { } void set_index(size_type index, - cuda::std::optional> const& binary_value, + std::optional> const& binary_value, Type const type) { if (binary_value.has_value()) { @@ -234,8 +234,8 @@ struct stats_caster { max.set_index(stats_idx, max_value, colchunk.meta_data.type); } else { // Marking it null, if column present in row group - min.set_index(stats_idx, cuda::std::nullopt, {}); - max.set_index(stats_idx, cuda::std::nullopt, {}); + min.set_index(stats_idx, std::nullopt, {}); + max.set_index(stats_idx, std::nullopt, {}); } stats_idx++; } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 1b69ccb7742..f0865c715bc 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -38,7 +38,7 @@ namespace { // be treated as a string. Currently the only logical type that has special handling is DECIMAL. // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which // for now would also be treated as a string). -inline bool is_treat_fixed_length_as_string(cuda::std::optional const& logical_type) +inline bool is_treat_fixed_length_as_string(std::optional const& logical_type) { if (!logical_type.has_value()) { return true; } return logical_type->type != LogicalType::DECIMAL; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index c588fedb85c..27312a4da89 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -371,11 +371,11 @@ int64_t find_next_split(int64_t cur_pos, * * @return A tuple of Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple> conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - cuda::std::optional logical_type) + std::optional logical_type) { int32_t const clock_rate = is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0; @@ -386,11 +386,11 @@ int64_t find_next_split(int64_t cur_pos, // if decimal but not outputting as float or decimal, then convert to no logical type if (column_type_id != type_id::FLOAT64 and not cudf::is_fixed_point(data_type{column_type_id})) { - return std::make_tuple(clock_rate, cuda::std::nullopt); + return {clock_rate, std::nullopt}; } } - return std::make_tuple(clock_rate, std::move(logical_type)); + return {clock_rate, std::move(logical_type)}; } /** diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 6d566b5815e..a6562d33de2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf; namespace { -cuda::std::optional converted_to_logical_type(SchemaElement const& schema) +std::optional converted_to_logical_type(SchemaElement const& schema) { if (schema.converted_type.has_value()) { switch (schema.converted_type.value()) { @@ -66,7 +66,7 @@ cuda::std::optional converted_to_logical_type(SchemaElement const& default: return LogicalType{LogicalType::UNDEFINED}; } } - return cuda::std::nullopt; + return std::nullopt; } } // namespace @@ -246,7 +246,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = cuda::std::nullopt; + struct_elem.converted_type = std::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ec05f35d405..190f13eb688 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -186,7 +186,7 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - cuda::std::optional> column_orders = cuda::std::nullopt; + std::optional> column_orders = std::nullopt; }; namespace { @@ -472,7 +472,7 @@ struct leaf_schema_fn { std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value @@ -750,7 +750,7 @@ std::vector construct_parquet_schema_tree( col_schema.type = Type::BYTE_ARRAY; } - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -2795,7 +2795,7 @@ std::unique_ptr> writer::merge_row_group_metadata( // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 for (auto& se : md.schema) { if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { - se.logical_type = cuda::std::nullopt; + se.logical_type = std::nullopt; } } diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index 6141a40bc95..a1b8677eac8 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2) int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype) + std::optional const& ctype) { auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index bd1579eaa1b..c90b81ed27a 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -172,7 +172,7 @@ std::pair create_parquet_typed_with_stats(std::string int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype); + std::optional const& ctype); void expect_compression_stats_empty(std::shared_ptr stats); diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index ae7c6fa8b8c..1e1e21fe18a 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -125,3 +126,22 @@ TEST_F(PinnedMemoryTest, MakeHostVector) EXPECT_FALSE(vec.get_allocator().is_device_accessible()); } } + +TEST_F(PinnedMemoryTest, HostSpan) +{ + auto test_ctors = [](auto&& vec) { + auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible(); + // Test conversion from a vector + auto const span = cudf::host_span{vec}; + EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible); + // Test conversion from host_span with different type + auto const span_converted = cudf::host_span{span}; + EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible); + }; + + cudf::set_allocate_host_as_pinned_threshold(7); + for (int i = 1; i < 10; i++) { + // some iterations will use pinned memory, some will not + test_ctors(cudf::detail::make_host_vector(i, cudf::get_default_stream())); + } +} diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 95813907bf4..ecf619ddc44 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,7 +342,10 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - "DeviceBuffer": ("rmm._lib.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), + # TODO: Replace the first entry in a follow-up with rmm.pylibrmm.device_buffer.DeviceBuffer + # when the RMM objects inventory is generated from branch-24.12. The RMM objects inventory + # can be accessed here : https://docs.rapids.ai/api/rmm/nightly/objects.inv + "DeviceBuffer": ("rmm.DeviceBuffer", "rmm.DeviceBuffer"), } diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 34b657488c1..5024747227e 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of: ``` - `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version 24.02 of cudf was the last to support pandas 1.5.x. +- In order for `cudf.pandas` to produce a proxy array that ducktypes as a NumPy + array, we create a proxy type that actually subclasses `numpy.ndarray`. We can + verify this with an isinstance check. + + ```python + %load_ext cudf.pandas + import pandas as pd + import numpy as np + + arr = pd.Series([1, 1, 2]).unique() # returns a proxy array + isinstance(arr, np.ndarray) # returns True, where arr is a proxy array + ``` + Because the proxy type ducktypes as a NumPy array, NumPy functions may attempt to + access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API. + However, our proxy mechanism is designed to proxy function calls at the Python + level, which is incompatible with these types of accesses. To handle these + situations, we perform an eager device-to-host (DtoH) copy, which sets the data + buffer correctly but incurs the cost of extra time when creating the proxy array. + In the previous example, creating `arr` performed this kind of implicit DtoH transfer. + + With this approach, we also get compatibility with third party libraries like `torch`. + + ```python + import torch + x = torch.from_numpy(arr) + ``` ## Can I force running on the CPU? diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8ceea4920e2..8b1d16f0d85 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column_view cimport ( mutable_column_view, ) from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef class Column: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 99e4c21df8a..065655505b8 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -28,7 +28,7 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.types cimport ( dtype_from_column_view, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 49714091f46..30353c4be6c 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -8,7 +8,7 @@ from libcpp.memory cimport make_shared, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer import pylibcudf diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 27095ca02d4..0f9820ed1db 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cdef class DeviceScalar: diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index d9595f4ab0a..93b67bd4c9d 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -10,10 +10,6 @@ from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - from_floats as cpp_from_floats, - to_floats as cpp_to_floats, -) from pylibcudf.libcudf.strings.convert.convert_integers cimport ( from_integers as cpp_from_integers, hex_to_integers as cpp_hex_to_integers, @@ -33,32 +29,18 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type def floating_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_floats( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.from_floats( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def string_to_floating(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_floats.to_floats( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type) ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_floats( - input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def dtos(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx index 7965b588703..5da6e3f10cc 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - is_float as cpp_is_float, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def is_float(Column source_strings): @@ -20,12 +13,7 @@ def is_float(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have floats. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.is_float( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index 78fc9f08bd8..dd2fafbe07f 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -23,7 +23,8 @@ from pylibcudf.libcudf.strings_udf cimport ( to_string_view_array as cpp_to_string_view_array, udf_string, ) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.column cimport Column diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 32ae8c5ee53..caff019f575 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -284,7 +284,7 @@ def memoryview( """Read-only access to the buffer through host memory.""" size = self._size if size is None else size host_buf = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self.get_ptr(mode="read") + offset, host_buf ) return memoryview(host_buf).toreadonly() diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 4c9e524ee05..b40c56c9a6b 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -207,7 +207,7 @@ def spill(self, target: str = "cpu") -> None: domain="cudf_python-spill", ): host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr, host_mem ) self._ptr_desc["memoryview"] = host_mem @@ -352,7 +352,7 @@ def memoryview( else: assert self._ptr_desc["type"] == "gpu" ret = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr + offset, ret ) return ret diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 43604ab21a7..a0cbe7ada19 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -99,7 +99,7 @@ def prepare_args(self, ty, val, **kwargs): ty.dtype, (StringView, UDFString) ): return types.uint64, val.ptr if isinstance( - val, rmm._lib.device_buffer.DeviceBuffer + val, rmm.pylibrmm.device_buffer.DeviceBuffer ) else val.get_ptr(mode="read") else: return ty, val diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index a37a12fc7e1..03808f0b664 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_factories cimport make_column_from_scalar from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 25664286f19..b019ed8f099 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport null_equality -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer from .column cimport Column from .table cimport Table diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd index 7a369701bbd..76f35cbba71 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column_view cimport ( ) from pylibcudf.libcudf.types cimport data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd index f1a326bcd40..b2388858127 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.types cimport ( type_id, ) -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 92f5a185a54..a09b6c01392 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view from pylibcudf.libcudf.utilities.host_span cimport host_span -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index e19e7fff334..12090af16cc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd index 4d4a4ba9b89..e6e719d6436 100644 --- a/python/pylibcudf/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ -16,7 +16,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 6f6c145b23c..21033a0284e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_equality, size_type -from rmm._lib.device_uvector cimport device_uvector +from rmm.librmm.device_uvector cimport device_uvector ctypedef unique_ptr[device_uvector[size_type]] gather_map_type ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 5f582091b06..27af4a3bdb1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd index f4fc4674506..a45c7f9979e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd @@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_floats( - column_view input_col, + column_view strings, data_type output_type) except + cdef unique_ptr[column] from_floats( - column_view input_col) except + + column_view floats) except + cdef unique_ptr[column] is_float( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd index 0c8fe1060ac..2eca043e451 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index 38298a7c1f1..d21510bd731 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd index ab5c0080312..9bdfaee2842 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -2,7 +2,7 @@ from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 5bdde06f21f..aae39987dac 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -6,7 +6,8 @@ from libcpp.utility cimport move from pylibcudf.libcudf cimport null_mask as cpp_null_mask from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint @@ -31,8 +32,8 @@ cpdef DeviceBuffer copy_bitmask(Column col): Returns ------- rmm.DeviceBuffer - A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty ``DeviceBuffer`` - if ``col`` is not nullable + A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty + ``DeviceBuffer`` if ``col`` is not nullable """ cdef device_buffer db diff --git a/python/pylibcudf/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd index 8664dfa4b7e..a273647c98d 100644 --- a/python/pylibcudf/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 3e20938af0c..d4888a62ad1 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -6,7 +6,7 @@ from libcpp.utility cimport move from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.scalar.scalar_factories cimport make_empty_scalar_like -from rmm._lib.memory_resource cimport get_current_device_resource +from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 41aeb72039b..7b228c06a18 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx convert_ipv4.pyx convert_urls.pyx + convert_fixed_point.pyx convert_floats.pyx convert_ipv4.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index b4b0b521e39..be6145384ad 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( convert_datetime, convert_durations, convert_fixed_point, + convert_floats, convert_ipv4, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 409620fce45..7c94387282b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -4,6 +4,7 @@ convert_datetime, convert_durations, convert_fixed_point, + convert_floats, convert_ipv4, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd new file mode 100644 index 00000000000..1284ff552aa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type) + +cpdef Column from_floats(Column floats) + +cpdef Column is_float(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..8081aadb085 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -0,0 +1,101 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_floats as cpp_convert_floats, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type): + """ + Returns a new numeric column by parsing float values from each string + in the provided strings column. + + For details, see cpp:func:`cudf::strings::to_floats` + + Parameters + ---------- + strings : Column + Strings instance for this operation. + + output_type : DataType + Type of float numeric column to return. + + Returns + ------- + Column + New column with floats converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.to_floats( + strings.view(), + output_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_floats(Column floats): + """ + Returns a new strings column converting the float values from the + provided column into strings. + + For details, see cpp:func:`cudf::strings::from_floats` + + Parameters + ---------- + floats : Column + Numeric column to convert. + + Returns + ------- + Column + New strings column with floats as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.from_floats( + floats.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_float(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to floats. + + For details, see cpp:func:`cudf::strings::is_float` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.is_float( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py new file mode 100644 index 00000000000..e9918fab559 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_floats(): + typ = pa.float32() + arr = pa.array(["-1.23", "1", None]) + result = plc.strings.convert.convert_floats.to_floats( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_floats(): + arr = pa.array([-1.23, 1, None]) + result = plc.strings.convert.convert_floats.from_floats( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["-1.23", "1.0", None]) + assert_column_eq(result, expected) + + +def test_is_float(): + arr = pa.array(["-1.23", "1", "1.2.3", "A", None]) + result = plc.strings.convert.convert_floats.is_float( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index de425a27c15..74134caeb78 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -9,7 +9,8 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column from .gpumemoryview cimport gpumemoryview