From 19814459fb31ecf628d40b3b542c1c4c718842c8 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 7 Nov 2024 09:57:32 -0500 Subject: [PATCH 01/40] Fix extract-datetime deprecation warning in ndsh benchmark (#17254) Fixes deprecation warning introduced by #17221 ``` [165+3+59=226] Building CXX object benchmarks/CMakeFiles/NDSH_Q09_NVBENCH.dir/ndsh/q09.cpp.o /cudf/cpp/benchmarks/ndsh/q09.cpp: In function 'void run_ndsh_q9(nvbench::state&, std::unordered_map, cuio_source_sink_pair>&)': /cudf/cpp/benchmarks/ndsh/q09.cpp:148:33: warning: 'std::unique_ptr cudf::datetime::extract_year(const cudf::column_view&, rmm::cuda_stream_view, rmm::device_async_resource_ref)' is deprecated [-Wdeprecated-declarations] 148 | auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); | ^~~~~~~~~~~~ In file included from /cudf/cpp/benchmarks/ndsh/q09.cpp:21: /cudf/cpp/include/cudf/datetime.hpp:70:46: note: declared here 70 | [[deprecated]] std::unique_ptr extract_year( | ^~~~~~~~~~~~ /cudf/cpp/benchmarks/ndsh/q09.cpp:148:45: warning: 'std::unique_ptr cudf::datetime::extract_year(const cudf::column_view&, rmm::cuda_stream_view, rmm::device_async_resource_ref)' is deprecated [-Wdeprecated-declarations] 148 | auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from /cudf/cpp/benchmarks/ndsh/q09.cpp:21: /cudf/cpp/include/cudf/datetime.hpp:70:46: note: declared here 70 | [[deprecated]] std::unique_ptr extract_year( | ^~~~~~~~~~~~ ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/17254 --- cpp/benchmarks/ndsh/q09.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 
2e9a69d9ee2..98c951101ed 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -145,7 +145,8 @@ void run_ndsh_q9(nvbench::state& state, // Calculate the `nation`, `o_year`, and `amount` columns auto n_name = std::make_unique(joined_table->column("n_name")); - auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); + auto o_year = cudf::datetime::extract_datetime_component( + joined_table->column("o_orderdate"), cudf::datetime::datetime_component::YEAR); auto amount = calculate_amount(joined_table->column("l_discount"), joined_table->column("l_extendedprice"), joined_table->column("ps_supplycost"), From 67c71e295e9f83f6bc2cd90545f023104c487cff Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:01:05 -0500 Subject: [PATCH 02/40] Refactor gather/scatter benchmarks for strings (#17223) Combines the `benchmarks/string/copy.cu` and `benchmarks/string/gather.cpp` source files which both had separate gather benchmarks for strings. The result is a new `copy.cpp` that has both gather and scatter benchmarks. Also changes the default parameters to remove the need to restrict the values. 
Authors: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/17223 --- cpp/benchmarks/CMakeLists.txt | 3 +- cpp/benchmarks/string/copy.cpp | 75 +++++++++++++++++++++++++ cpp/benchmarks/string/copy.cu | 95 -------------------------------- cpp/benchmarks/string/gather.cpp | 60 -------------------- 4 files changed, 76 insertions(+), 157 deletions(-) create mode 100644 cpp/benchmarks/string/copy.cpp delete mode 100644 cpp/benchmarks/string/copy.cu delete mode 100644 cpp/benchmarks/string/gather.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index bdc360c082b..f6a5c97e059 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -358,7 +358,6 @@ ConfigureBench( STRINGS_BENCH string/convert_datetime.cpp string/convert_durations.cpp - string/copy.cu string/factory.cu string/filter.cpp string/repeat_strings.cpp @@ -375,12 +374,12 @@ ConfigureNVBench( string/contains.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp + string/copy.cpp string/copy_if_else.cpp string/copy_range.cpp string/count.cpp string/extract.cpp string/find.cpp - string/gather.cpp string/join_strings.cpp string/lengths.cpp string/like.cpp diff --git a/cpp/benchmarks/string/copy.cpp b/cpp/benchmarks/string/copy.cpp new file mode 100644 index 00000000000..2baccd4fad1 --- /dev/null +++ b/cpp/benchmarks/string/copy.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +static void bench_copy(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const api = state.get_string("api"); + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const source = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + + data_profile const map_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + auto const map_table = + create_random_table({cudf::type_to_id()}, row_count{num_rows}, map_profile); + auto const map_view = map_table->view().column(0); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (api == "gather") { + auto result = + cudf::gather(source->view(), map_view, cudf::out_of_bounds_policy::NULLIFY, stream); + auto chars_size = cudf::strings_column_view(result->view().column(0)).chars_size(stream); + state.add_global_memory_reads(chars_size + + (map_view.size() * sizeof(cudf::size_type))); + state.add_global_memory_writes(chars_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::gather(source->view(), map_view, cudf::out_of_bounds_policy::NULLIFY, stream); + }); + } else if 
(api == "scatter") { + auto const target = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + auto result = cudf::scatter(source->view(), map_view, target->view(), stream); + auto chars_size = cudf::strings_column_view(result->view().column(0)).chars_size(stream); + state.add_global_memory_reads(chars_size + + (map_view.size() * sizeof(cudf::size_type))); + state.add_global_memory_writes(chars_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::scatter(source->view(), map_view, target->view(), stream); + }); + } +} + +NVBENCH_BENCH(bench_copy) + .set_name("copy") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_string_axis("api", {"gather", "scatter"}); diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu deleted file mode 100644 index 6b2f6c3a0a7..00000000000 --- a/cpp/benchmarks/string/copy.cu +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "string_bench_args.hpp" - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -class StringCopy : public cudf::benchmark {}; - -enum copy_type { gather, scatter }; - -static void BM_copy(benchmark::State& state, copy_type ct) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - - auto const source = - create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); - auto const target = - create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); - - // scatter indices - auto index_map_col = make_numeric_column( - cudf::data_type{cudf::type_id::INT32}, n_rows, cudf::mask_state::UNALLOCATED); - auto index_map = index_map_col->mutable_view(); - thrust::shuffle_copy(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(n_rows), - index_map.begin(), - thrust::default_random_engine()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (ct) { - case gather: cudf::gather(source->view(), index_map); break; - case scatter: cudf::scatter(source->view(), index_map, target->view()); break; - } - } - - state.SetBytesProcessed( - state.iterations() * - cudf::strings_column_view(source->view().column(0)).chars_size(cudf::get_default_stream())); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); - - // Benchmark for very small strings - b->Args({67108864, 2}); -} - 
-#define COPY_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringCopy, name) \ - (::benchmark::State & st) { BM_copy(st, copy_type::name); } \ - BENCHMARK_REGISTER_F(StringCopy, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -COPY_BENCHMARK_DEFINE(gather) -COPY_BENCHMARK_DEFINE(scatter) diff --git a/cpp/benchmarks/string/gather.cpp b/cpp/benchmarks/string/gather.cpp deleted file mode 100644 index 5b1c679be7d..00000000000 --- a/cpp/benchmarks/string/gather.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include - -#include - -static void bench_gather(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const input_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); - - data_profile const map_profile = data_profile_builder().no_validity().distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows); - auto const map_table = - create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = - cudf::strings_column_view(input_table->view().column(0)).chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); // all bytes are read; - state.add_global_memory_writes(chars_size); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::gather( - input_table->view(), map_table->view().column(0), cudf::out_of_bounds_policy::NULLIFY); - }); -} - -NVBENCH_BENCH(bench_gather) - .set_name("gather") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); From 08e4853f8c0147492d7fa7ff7a183ed989a5b6ba Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 7 Nov 2024 16:31:12 +0100 Subject: [PATCH 03/40] AWS S3 IO through KvikIO (#16499) Implement remote IO read using KvikIO's S3 backend. For now, this is an experimental feature for parquet read only. 
Enable by defining `CUDF_KVIKIO_REMOTE_IO=ON`. Authors: - Mads R. B. Kristensen (https://github.com/madsbk) - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Vukasin Milovanovic (https://github.com/vuule) - Shruti Shivakumar (https://github.com/shrshi) - Richard (Rick) Zamora (https://github.com/rjzamora) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16499 --- cpp/src/io/utilities/datasource.cpp | 87 ++++++++++++++++++++++++- python/cudf/cudf/options.py | 16 +++++ python/cudf/cudf/tests/test_s3.py | 11 ++++ python/cudf/cudf/utils/ioutils.py | 33 +++++++--- python/pylibcudf/pylibcudf/io/types.pyx | 12 ++-- 5 files changed, 144 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 15a4a270ce0..9ea39e692b6 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -26,6 +26,7 @@ #include #include +#include #include @@ -33,6 +34,7 @@ #include #include +#include #include namespace cudf { @@ -389,6 +391,86 @@ class user_datasource_wrapper : public datasource { datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; +/** + * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. 
+ */ +class remote_file_source : public datasource { + static std::unique_ptr create_s3_endpoint(char const* filepath) + { + auto [bucket_name, bucket_object] = kvikio::S3Endpoint::parse_s3_url(filepath); + return std::make_unique(bucket_name, bucket_object); + } + + public: + explicit remote_file_source(char const* filepath) : _kvikio_file{create_s3_endpoint(filepath)} {} + + ~remote_file_source() override = default; + + [[nodiscard]] bool supports_device_read() const override { return true; } + + [[nodiscard]] bool is_device_read_preferred(size_t size) const override { return true; } + + [[nodiscard]] size_t size() const override { return _kvikio_file.nbytes(); } + + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); + + auto const read_size = std::min(size, this->size() - offset); + return _kvikio_file.pread(dst, read_size, offset); + } + + size_t device_read(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + return device_read_async(offset, size, dst, stream).get(); + } + + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + rmm::device_buffer out_data(size, stream); + size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + out_data.resize(read, stream); + return datasource::buffer::create(std::move(out_data)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + auto const read_size = std::min(size, this->size() - offset); + return _kvikio_file.pread(dst, read_size, offset).get(); + } + + std::unique_ptr host_read(size_t offset, size_t size) override + { + auto const count = std::min(size, this->size() - offset); + std::vector h_data(count); + this->host_read(offset, count, h_data.data()); + return datasource::buffer::create(std::move(h_data)); + } + + 
/** + * @brief Is `url` referring to a remote file supported by KvikIO? + * + * For now, only S3 urls (urls starting with "s3://") are supported. + */ + static bool is_supported_remote_url(std::string const& url) + { + // Regular expression to match "s3://" + std::regex pattern{R"(^s3://)", std::regex_constants::icase}; + return std::regex_search(url, pattern); + } + + private: + kvikio::RemoteHandle _kvikio_file; +}; + } // namespace std::unique_ptr datasource::create(std::string const& filepath, @@ -403,8 +485,9 @@ std::unique_ptr datasource::create(std::string const& filepath, CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); }(); - - if (use_memory_mapping) { + if (remote_file_source::is_supported_remote_url(filepath)) { + return std::make_unique(filepath.c_str()); + } else if (use_memory_mapping) { return std::make_unique(filepath.c_str(), offset, max_size_estimate); } else { // `file_source` reads the file directly, without memory mapping diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index df7bbe22a61..e206c8bca08 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -351,6 +351,22 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + "kvikio_remote_io", + _env_get_bool("CUDF_KVIKIO_REMOTE_IO", False), + textwrap.dedent( + """ + Whether to use KvikIO's remote IO backend or not. + \tWARN: this is experimental and may be removed at any time + \twithout warning or deprecation period. + \tSet KVIKIO_NTHREADS (default is 8) to change the number of + \tconcurrent tcp connections, which is important for good performance. + \tValid values are True or False. Default is False. 
+ """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 0958b68084d..afb82f75bcf 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -69,6 +69,7 @@ def s3_base(endpoint_ip, endpoint_port): # with an S3 endpoint on localhost endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" + os.environ["AWS_ENDPOINT_URL"] = endpoint_uri server = ThreadedMotoServer(ip_address=endpoint_ip, port=endpoint_port) server.start() @@ -105,6 +106,15 @@ def s3_context(s3_base, bucket, files=None): pass +@pytest.fixture( + params=[True, False], + ids=["kvikio=ON", "kvikio=OFF"], +) +def kvikio_remote_io(request): + with cudf.option_context("kvikio_remote_io", request.param): + yield request.param + + @pytest.fixture def pdf(scope="module"): df = pd.DataFrame() @@ -193,6 +203,7 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): def test_read_parquet( s3_base, s3so, + kvikio_remote_io, pdf, bytes_per_thread, columns, diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index d636f36f282..aecb7ae7c5c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -16,6 +16,7 @@ import pandas as pd from fsspec.core import expand_paths_if_needed, get_fs_token_paths +import cudf from cudf.api.types import is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -1624,6 +1625,16 @@ def _maybe_expand_directories(paths, glob_pattern, fs): return expanded_paths +def _use_kvikio_remote_io(fs) -> bool: + """Whether `kvikio_remote_io` is enabled and `fs` refers to a S3 file""" + + try: + from s3fs.core import S3FileSystem + except ImportError: + return False + return cudf.get_option("kvikio_remote_io") and isinstance(fs, S3FileSystem) + + @doc_get_reader_filepath_or_buffer() def get_reader_filepath_or_buffer( path_or_data, 
@@ -1649,17 +1660,17 @@ def get_reader_filepath_or_buffer( ) ] if not input_sources: - raise ValueError("Empty input source list: {input_sources}.") + raise ValueError(f"Empty input source list: {input_sources}.") filepaths_or_buffers = [] string_paths = [isinstance(source, str) for source in input_sources] if any(string_paths): - # Sources are all strings. Thes strings are typically + # Sources are all strings. The strings are typically # file paths, but they may also be raw text strings. # Don't allow a mix of source types if not all(string_paths): - raise ValueError("Invalid input source list: {input_sources}.") + raise ValueError(f"Invalid input source list: {input_sources}.") # Make sure we define a filesystem (if possible) paths = input_sources @@ -1712,11 +1723,17 @@ def get_reader_filepath_or_buffer( raise FileNotFoundError( f"{input_sources} could not be resolved to any files" ) - filepaths_or_buffers = _prefetch_remote_buffers( - paths, - fs, - **(prefetch_options or {}), - ) + + # If `kvikio_remote_io` is enabled and `fs` refers to a S3 file, + # we create S3 URLs and let them pass-through to libcudf. + if _use_kvikio_remote_io(fs): + filepaths_or_buffers = [f"s3://{fpath}" for fpath in paths] + else: + filepaths_or_buffers = _prefetch_remote_buffers( + paths, + fs, + **(prefetch_options or {}), + ) else: raw_text_input = True diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 967d05e7057..c129903f8f1 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -20,6 +20,7 @@ import codecs import errno import io import os +import re from pylibcudf.libcudf.io.json import \ json_recovery_mode_t as JSONRecoveryMode # no-cython-lint @@ -147,6 +148,8 @@ cdef class SourceInfo: Mixing different types of sources will raise a `ValueError`. 
""" + # Regular expression that match remote file paths supported by libcudf + _is_remote_file_pattern = re.compile(r"^s3://", re.IGNORECASE) def __init__(self, list sources): if not sources: @@ -161,11 +164,10 @@ cdef class SourceInfo: for src in sources: if not isinstance(src, (os.PathLike, str)): raise ValueError("All sources must be of the same type!") - if not os.path.isfile(src): - raise FileNotFoundError(errno.ENOENT, - os.strerror(errno.ENOENT), - src) - + if not (os.path.isfile(src) or self._is_remote_file_pattern.match(src)): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), src + ) c_files.push_back( str(src).encode()) self.c_obj = move(source_info(c_files)) From c209daeb10dad9b153e0fbcde873c304951ff158 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Nov 2024 08:52:24 -0800 Subject: [PATCH 04/40] Add io.text APIs to pylibcudf (#17232) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17232 --- .../api_docs/pylibcudf/io/index.rst | 1 + .../user_guide/api_docs/pylibcudf/io/text.rst | 6 + python/cudf/cudf/_lib/text.pyx | 82 +++----- python/pylibcudf/pylibcudf/io/CMakeLists.txt | 2 +- python/pylibcudf/pylibcudf/io/__init__.pxd | 2 +- python/pylibcudf/pylibcudf/io/__init__.py | 2 +- python/pylibcudf/pylibcudf/io/text.pxd | 30 +++ python/pylibcudf/pylibcudf/io/text.pyx | 193 ++++++++++++++++++ .../pylibcudf/pylibcudf/tests/io/test_text.py | 29 +++ 9 files changed, 285 insertions(+), 62 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst create mode 100644 python/pylibcudf/pylibcudf/io/text.pxd create mode 100644 python/pylibcudf/pylibcudf/io/text.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_text.py diff --git 
a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index 53638f071cc..cd5c5a5f77e 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -19,4 +19,5 @@ I/O Functions csv json parquet + text timezone diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst new file mode 100644 index 00000000000..327ca043f36 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst @@ -0,0 +1,6 @@ +==== +text +==== + +.. automodule:: pylibcudf.io.text + :members: diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index b2c7232f549..7942d067c2b 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,33 +1,20 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from io import TextIOBase +from libcpp cimport bool -from cython.operator cimport dereference -from libc.stdint cimport uint64_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move +from io import TextIOBase -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.io.text cimport ( - byte_range_info, - data_chunk_source, - make_source, - make_source_from_bgzip_file, - make_source_from_file, - multibyte_split, - parse_options, -) +import pylibcudf as plc from cudf._lib.column cimport Column def read_text(object filepaths_or_buffers, - object delimiter=None, - object byte_range=None, - object strip_delimiters=False, - object compression=None, - object compression_offsets=None): + str delimiter, + object byte_range, + bool strip_delimiters, + object compression, + object compression_offsets): """ Cython function to call into libcudf API, see `multibyte_split`. 
@@ -35,24 +22,11 @@ def read_text(object filepaths_or_buffers, -------- cudf.io.text.read_text """ - cdef string delim = delimiter.encode() - - cdef unique_ptr[data_chunk_source] datasource - cdef unique_ptr[column] c_col - - cdef size_t c_byte_range_offset - cdef size_t c_byte_range_size - cdef uint64_t c_compression_begin_offset - cdef uint64_t c_compression_end_offset - cdef parse_options c_options - if compression is None: if isinstance(filepaths_or_buffers, TextIOBase): - datasource = move(make_source( - filepaths_or_buffers.read().encode())) + datasource = plc.io.text.make_source(filepaths_or_buffers.read()) else: - datasource = move(make_source_from_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_file(filepaths_or_buffers) elif compression == "bgzip": if isinstance(filepaths_or_buffers, TextIOBase): raise ValueError("bgzip compression requires a file path") @@ -60,30 +34,20 @@ def read_text(object filepaths_or_buffers, if len(compression_offsets) != 2: raise ValueError( "compression offsets need to consist of two elements") - c_compression_begin_offset = compression_offsets[0] - c_compression_end_offset = compression_offsets[1] - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode(), - c_compression_begin_offset, - c_compression_end_offset)) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + compression_offsets[0], + compression_offsets[1] + ) else: - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + ) else: raise ValueError("Only bgzip compression is supported at the moment") - c_options = parse_options() - if byte_range is not None: - c_byte_range_offset = byte_range[0] - c_byte_range_size = byte_range[1] - c_options.byte_range = byte_range_info( - c_byte_range_offset, - c_byte_range_size) - c_options.strip_delimiters = strip_delimiters - with nogil: - 
c_col = move(multibyte_split( - dereference(datasource), - delim, - c_options)) - - return Column.from_unique_ptr(move(c_col)) + options = plc.io.text.ParseOptions( + byte_range=byte_range, strip_delimiters=strip_delimiters + ) + plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index 965724a47b1..f78d97ef4d1 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx - types.pyx + text.pyx types.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 1bcc0a3f963..6ba7f78a013 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, orc, parquet, timezone, types +from . cimport avro, datasource, json, orc, parquet, timezone, text, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 2e4f215b12c..0fc77dd0f57 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, orc, parquet, timezone, types +from . 
import avro, csv, datasource, json, orc, parquet, timezone, text, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd new file mode 100644 index 00000000000..051e9bc0cde --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/text.pxd @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source + +cdef class ParseOptions: + cdef parse_options c_options + +cdef class DataChunkSource: + cdef unique_ptr[data_chunk_source] c_source + cdef string data_ref + + +cpdef Column multibyte_split( + DataChunkSource source, + str delimiter, + ParseOptions options=* +) + +cpdef DataChunkSource make_source(str data) + +cpdef DataChunkSource make_source_from_file(str filename) + +cpdef DataChunkSource make_source_from_bgzip_file( + str filename, + int virtual_begin=*, + int virtual_end=*, +) diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx new file mode 100644 index 00000000000..667a054baaa --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/text.pyx @@ -0,0 +1,193 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libc.stdint cimport uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.io cimport text as cpp_text + +cdef class ParseOptions: + """ + Parsing options for `multibyte_split` + + Parameters + ---------- + byte_range : list | tuple, default None + Only rows starting inside this byte range will be + part of the output column. 
+ + strip_delimiters : bool, default False + Whether delimiters at the end of rows should + be stripped from the output column. + """ + def __init__( + self, + *, + byte_range=None, + strip_delimiters=False, + ): + self.c_options = cpp_text.parse_options() + if byte_range is not None: + c_byte_range_offset = byte_range[0] + c_byte_range_size = byte_range[1] + self.c_options.byte_range = cpp_text.byte_range_info( + c_byte_range_offset, + c_byte_range_size + ) + self.c_options.strip_delimiters = strip_delimiters + + +cdef class DataChunkSource: + """ + Data source for `multibyte_split` + + Parameters + ---------- + data : str + Filename or data itself. + """ + + def __cinit__(self, str data): + # Need to keep a reference alive for make_source + self.data_ref = data.encode() + + +cpdef DataChunkSource make_source(str data): + """ + Creates a data source capable of producing device-buffered views + of the given string. + + Parameters + ---------- + data : str + The host data to be exposed as a data chunk source. + + Returns + ------- + DataChunkSource + The data chunk source for the provided host data. + """ + cdef DataChunkSource dcs = DataChunkSource(data) + with nogil: + dcs.c_source = move(cpp_text.make_source(dcs.data_ref)) + return dcs + + +cpdef DataChunkSource make_source_from_file(str filename): + """ + Creates a data source capable of producing device-buffered views of the file. + + Parameters + ---------- + filename : str + The filename of the file to be exposed as a data chunk source. + + Returns + ------- + DataChunkSource + The data chunk source for the provided filename. 
+ """ + cdef DataChunkSource dcs = DataChunkSource(filename) + with nogil: + dcs.c_source = move(cpp_text.make_source_from_file(dcs.data_ref)) + return dcs + +cpdef DataChunkSource make_source_from_bgzip_file( + str filename, + int virtual_begin=-1, + int virtual_end=-1, +): + """ + Creates a data source capable of producing device-buffered views of + a BGZIP compressed file with virtual record offsets. + + Parameters + ---------- + filename : str + The filename of the BGZIP-compressed file to be exposed as a data chunk source. + + virtual_begin : int, default -1 + The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits + describe the offset into the compressed file, its lower 16 bits describe the + block-local offset. + + virtual_end : int, default -1 + The virtual (Tabix) offset one past the last byte to be read. + + Returns + ------- + DataChunkSource + The data chunk source for the provided filename. + """ + cdef uint64_t c_virtual_begin + cdef uint64_t c_virtual_end + cdef DataChunkSource dcs = DataChunkSource(filename) + + if virtual_begin == -1 and virtual_end == -1: + with nogil: + dcs.c_source = move(cpp_text.make_source_from_bgzip_file(dcs.data_ref)) + elif virtual_begin != -1 and virtual_end != -1: + c_virtual_begin = virtual_begin + c_virtual_end = virtual_end + with nogil: + dcs.c_source = move( + cpp_text.make_source_from_bgzip_file( + dcs.data_ref, + c_virtual_begin, + c_virtual_end, + ) + ) + else: + raise ValueError( + "virtual_begin and virtual_end must both be None or both be int" + ) + return dcs + +cpdef Column multibyte_split( + DataChunkSource source, + str delimiter, + ParseOptions options=None +): + """ + Splits the source text into a strings column using a multiple byte delimiter. + + For details, see :cpp:func:`cudf::io::text::multibyte_split` + + Parameters + ---------- + source : DataChunkSource + The data chunk source to split. + + delimiter : str + UTF-8 encoded string for which to find offsets in the source. 
+ + options : ParseOptions + The parsing options to use (including byte range). + + Returns + ------- + Column + The strings found by splitting the source by the delimiter + within the relevant byte range. + """ + cdef unique_ptr[column] c_result + cdef unique_ptr[data_chunk_source] c_source = move(source.c_source) + cdef string c_delimiter = delimiter.encode() + + if options is None: + options = ParseOptions() + + cdef cpp_text.parse_options c_options = options.c_options + + with nogil: + c_result = cpp_text.multibyte_split( + dereference(c_source), + c_delimiter, + c_options + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py new file mode 100644 index 00000000000..f69e940e34e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize( + "source_func", + [ + "make_source", + "make_source_from_file", + ], +) +@pytest.mark.parametrize("options", [None, plc.io.text.ParseOptions()]) +def test_multibyte_split(source_func, options, tmp_path): + data = "x::y::z" + func = getattr(plc.io.text, source_func) + if source_func == "make_source": + source = func(data) + elif source_func == "make_source_from_file": + fle = tmp_path / "fle.txt" + fle.write_text(data) + source = func(str(fle)) + result = plc.io.text.multibyte_split(source, "::", options) + expected = pa.array(["x::", "y::", "z"]) + assert_column_eq(result, expected) From 2db58d58b4a986c2c6fad457f291afb1609fd458 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 7 Nov 2024 11:02:07 -0600 Subject: [PATCH 05/40] Add support for `pyarrow-18` (#17256) This PR unpins the max `pyarrow` version allowed to `18`. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17256 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- dependencies.yaml | 6 +++--- python/cudf/pyproject.toml | 4 ++-- python/pylibcudf/pyproject.toml | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8a64ebf40c5..6fbdd4ba568 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -69,7 +69,7 @@ dependencies: - polars>=1.11,<1.13 - pre-commit - ptxcompiler -- pyarrow>=14.0.0,<18.0.0a0 +- pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 5f779c3170f..4aafa12fdae 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandoc - polars>=1.11,<1.13 - pre-commit -- pyarrow>=14.0.0,<18.0.0a0 +- pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 - pynvjitlink>=0.0.0a0 - pytest-benchmark diff --git a/dependencies.yaml b/dependencies.yaml index 4c6aefe996f..93213172445 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -442,7 +442,7 @@ dependencies: common: - output_types: [conda] packages: - - pyarrow>=14.0.0,<18.0.0a0 + - pyarrow>=14.0.0,<19.0.0a0 - output_types: [requirements, pyproject] packages: # pyarrow 17.0.0 wheels have a subtle issue around threading that @@ -450,8 +450,8 @@ dependencies: # be highly dependent on the exact build configuration, so we'll just # avoid 17.0.0 for now unless we observe similar issues in future # releases as 
well. - - pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64' - - pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64' + - pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64' + - pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64' cuda_version: specific: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 1eadceaaccd..41dedc4ff20 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -30,8 +30,8 @@ dependencies = [ "packaging", "pandas>=2.0,<2.2.4dev0", "ptxcompiler", - "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", + "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", "pylibcudf==24.12.*,>=0.0.0a0", "rich", "rmm==24.12.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index b2cec80f484..ac3018b9333 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -22,8 +22,8 @@ dependencies = [ "libcudf==24.12.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", - "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", + "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", "rmm==24.12.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 5147882eb99445a803d1f4acb6a718a6a88001d6 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:24:58 -0500 Subject: [PATCH 06/40] Process parquet bools with microkernels (#17157) This adds support for the bool type to reading parquet microkernels. 
Both plain (bit-packed) and RLE-encoded bool decode is supported, using separate code paths. This PR also massively reduces boilerplate code, as most of the template info needed is already encoded in the kernel mask. Also the superfluous level_t template parameter on rle_run has been removed. And bools have been added to the parquet benchmarks. Performance: register count drops from 62 -> 56, both plain and RLE-encoded bool decoding are now 46% faster (uncompressed). Reading sample customer data shows no change. NDS tests show no change. Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Yunsong Wang (https://github.com/PointKernel) - https://github.com/nvdbaranec - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17157 --- cpp/benchmarks/io/nvbench_helpers.hpp | 2 + .../io/parquet/parquet_reader_input.cpp | 2 + .../io/parquet/parquet_reader_options.cpp | 3 +- cpp/benchmarks/io/parquet/parquet_writer.cpp | 3 + cpp/src/io/parquet/decode_fixed.cu | 425 +++++++----------- cpp/src/io/parquet/decode_preprocess.cu | 4 +- cpp/src/io/parquet/page_hdr.cu | 6 +- cpp/src/io/parquet/page_string_decode.cu | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 77 +--- cpp/src/io/parquet/reader_impl.cpp | 119 ++--- cpp/src/io/parquet/rle_stream.cuh | 9 +- 11 files changed, 230 insertions(+), 424 deletions(-) diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp index 1e3ab2b7b4f..cc548ccd3de 100644 --- a/cpp/benchmarks/io/nvbench_helpers.hpp +++ b/cpp/benchmarks/io/nvbench_helpers.hpp @@ -28,6 +28,7 @@ enum class data_type : int32_t { INTEGRAL = static_cast(type_group_id::INTEGRAL), INTEGRAL_SIGNED = static_cast(type_group_id::INTEGRAL_SIGNED), FLOAT = static_cast(type_group_id::FLOATING_POINT), + BOOL8 = static_cast(cudf::type_id::BOOL8), DECIMAL = static_cast(type_group_id::FIXED_POINT), TIMESTAMP = static_cast(type_group_id::TIMESTAMP), DURATION = static_cast(type_group_id::DURATION), 
@@ -44,6 +45,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( case data_type::INTEGRAL: return "INTEGRAL"; case data_type::INTEGRAL_SIGNED: return "INTEGRAL_SIGNED"; case data_type::FLOAT: return "FLOAT"; + case data_type::BOOL8: return "BOOL8"; case data_type::DECIMAL: return "DECIMAL"; case data_type::TIMESTAMP: return "TIMESTAMP"; case data_type::DURATION: return "DURATION"; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index ce115fd7723..b14f9cbb67e 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -114,6 +114,7 @@ void BM_parquet_read_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -298,6 +299,7 @@ void BM_parquet_read_wide_tables_mixed(nvbench::state& state) using d_type_list = nvbench::enum_type_list(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 256e50f0e64..84e4b8b93c0 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -89,6 +89,7 @@ void BM_parq_write_io_compression( { auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -143,6 +144,7 @@ void BM_parq_write_varying_options( auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), + 
static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -181,6 +183,7 @@ void BM_parq_write_varying_options( using d_type_list = nvbench::enum_type_list(dst)); } else if (dtype == INT96) { gpuOutputInt96Timestamp(s, sb, src_pos, static_cast(dst)); } else if (dtype_len == 8) { @@ -841,6 +843,33 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) return run_val != s->col.max_level[lvl]; } +template +inline __device__ void bool_plain_decode(page_state_s* s, state_buf* sb, int t, int to_decode) +{ + int pos = s->dict_pos; + int const target_pos = pos + to_decode; + + while (pos < target_pos) { + int const batch_len = min(target_pos - pos, decode_block_size_t); + + if (t < batch_len) { + int const bit_pos = pos + t; + int const byte_offset = bit_pos >> 3; + int const bit_in_byte_index = bit_pos & 7; + + uint8_t const* const read_from = s->data_start + byte_offset; + bool const read_bit = (*read_from) & (1 << bit_in_byte_index); + + int const write_to_index = rolling_index(bit_pos); + sb->dict_idx[write_to_index] = read_bit; + } + + pos += batch_len; + } + + if (t == 0) { s->dict_pos = pos; } +} + template __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) { @@ -872,14 +901,7 @@ __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) * @param num_rows Maximum number of rows to read * @param error_code Error code to set if an error is encountered */ -template - typename DecodeValuesFunc> +template CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) gpuDecodePageDataGeneric(PageInfo* pages, device_span chunks, @@ -887,12 +909,33 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) size_t num_rows, kernel_error::pointer error_code) { + constexpr bool has_dict_t = (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) || + (kernel_mask_t == 
decode_kernel_mask::FIXED_WIDTH_DICT_LIST); + constexpr bool has_bools_t = (kernel_mask_t == decode_kernel_mask::BOOLEAN) || + (kernel_mask_t == decode_kernel_mask::BOOLEAN_NESTED) || + (kernel_mask_t == decode_kernel_mask::BOOLEAN_LIST); + constexpr bool has_nesting_t = + (kernel_mask_t == decode_kernel_mask::BOOLEAN_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED); + constexpr bool has_lists_t = + (kernel_mask_t == decode_kernel_mask::BOOLEAN_LIST) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_LIST) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); + constexpr bool split_decode_t = + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); + constexpr int rolling_buf_size = decode_block_size_t * 2; constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); __shared__ __align__(16) page_state_s state_g; using state_buf_t = page_state_buffers_s; __shared__ __align__(16) state_buf_t state_buffers; @@ -920,32 +963,31 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. if (s->num_rows == 0) { return; } - DecodeValuesFunc decode_values; + using value_decoder_type = std::conditional_t< + split_decode_t, + decode_fixed_width_split_values_func, + decode_fixed_width_values_func>; + value_decoder_type decode_values; bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - constexpr int shared_rep_size = - has_lists_t - ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) - : 0; - constexpr int shared_dict_size = - has_dict_t - ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) - : 0; - constexpr int shared_def_size = - cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); - constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; + constexpr int rle_run_buffer_bytes = + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); + constexpr int shared_buf_size = + rle_run_buffer_bytes * (static_cast(has_dict_t) + static_cast(has_bools_t) + + static_cast(has_lists_t) + 1); __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers - int shared_offset = 0; - rle_run* rep_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_lists_t) { shared_offset += shared_rep_size; } - - rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_dict_t) { shared_offset += shared_dict_size; } - rle_run* def_runs = reinterpret_cast*>(shared_buf + shared_offset); + int shared_offset = 0; + auto rep_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_lists_t) { shared_offset += rle_run_buffer_bytes; } + auto dict_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_dict_t) { shared_offset += rle_run_buffer_bytes; } + auto bool_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_bools_t) { shared_offset += rle_run_buffer_bytes; } + auto def_runs = reinterpret_cast(shared_buf + shared_offset); // initialize the stream decoders (requires values computed in setupLocalPageInfo) rle_stream def_decoder{def_runs}; @@ -974,6 +1016,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); } + // Use dictionary stream memory for bools + 
rle_stream bool_stream{bool_runs}; + bool bools_are_rle_stream = (s->dict_run == 0); + if constexpr (has_bools_t) { + if (bools_are_rle_stream) { + bool_stream.init(1, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); + } + } + __syncthreads(); + // We use two counters in the loop below: processed_count and valid_count. // - processed_count: number of values out of num_input_values that we have decoded so far. // the definition stream returns the number of total rows it has processed in each call @@ -1041,13 +1093,20 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) } __syncthreads(); - // if we have dictionary data + // if we have dictionary or bool data + // We want to limit the number of dictionary/bool items we decode, that correspond to + // the rows we have processed in this iteration that are valid. + // We know the number of valid rows to process with: next_valid_count - valid_count. if constexpr (has_dict_t) { - // We want to limit the number of dictionary items we decode, that correspond to - // the rows we have processed in this iteration that are valid. - // We know the number of valid rows to process with: next_valid_count - valid_count. 
dict_stream.decode_next(t, next_valid_count - valid_count); __syncthreads(); + } else if constexpr (has_bools_t) { + if (bools_are_rle_stream) { + bool_stream.decode_next(t, next_valid_count - valid_count); + } else { + bool_plain_decode(s, sb, t, next_valid_count - valid_count); + } + __syncthreads(); } // decode the values themselves @@ -1061,250 +1120,82 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) } // anonymous namespace -void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) -{ - constexpr int decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - - if (level_type_size == 1) { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } -} +template +using kernel_tag_t = std::integral_constant; -void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) -{ - constexpr int decode_block_size = 
128; - - dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks - - if (level_type_size == 1) { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } -} +template +using int_tag_t = std::integral_constant; -void __host__ -DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) +void __host__ DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + decode_kernel_mask kernel_mask, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream) { - constexpr int decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks - - if (level_type_size == 1) { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); 
- } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric + // No template parameters on lambdas until C++20, so use type tags instead + auto launch_kernel = [&](auto block_size_tag, auto kernel_mask_tag) { + constexpr int decode_block_size = decltype(block_size_tag)::value; + constexpr decode_kernel_mask mask = decltype(kernel_mask_tag)::value; + + dim3 dim_block(decode_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } else { - gpuDecodePageDataGeneric + gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } + }; + + switch (kernel_mask) { + case decode_kernel_mask::FIXED_WIDTH_NO_DICT: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_DICT: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_DICT_NESTED: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_DICT_LIST: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST: + launch_kernel(int_tag_t<128>{}, + 
kernel_tag_t{}); + break; + case decode_kernel_mask::BOOLEAN: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::BOOLEAN_NESTED: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::BOOLEAN_LIST: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + default: CUDF_EXPECTS(false, "Kernel type not handled by this function"); break; } } diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 62f1ee88036..5b9831668e6 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -343,8 +343,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - __shared__ rle_run rep_runs[rle_run_buffer_size]; + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 52d53cb8225..a8a8c441a84 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -181,9 +181,13 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } else if (is_string_col(chunk)) { // check for string before byte_stream_split so FLBA will go to the right kernel return decode_kernel_mask::STRING; + } else if (is_boolean(chunk)) { + return is_list(chunk) ? decode_kernel_mask::BOOLEAN_LIST + : is_nested(chunk) ? decode_kernel_mask::BOOLEAN_NESTED + : decode_kernel_mask::BOOLEAN; } - if (!is_byte_array(chunk) && !is_boolean(chunk)) { + if (!is_byte_array(chunk)) { if (page.encoding == Encoding::PLAIN) { return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST : is_nested(chunk) ? 
decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index ca74a1c2ba0..5ece3a54892 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -618,8 +618,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBo constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - __shared__ rle_run rep_runs[rle_run_buffer_size]; + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index dba24b553e6..3b4d0e6dc80 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -224,6 +224,9 @@ enum class decode_kernel_mask { FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists + BOOLEAN = (1 << 15), // Run decode kernel for boolean data + BOOLEAN_NESTED = (1 << 16), // Run decode kernel for nested boolean data + BOOLEAN_LIST = (1 << 17), // Run decode kernel for list boolean data }; // mask representing all the ways in which a string can be encoded @@ -539,7 +542,7 @@ enum class encode_kernel_mask { DELTA_BINARY = (1 << 2), // Run DELTA_BINARY_PACKED encoding kernel DELTA_LENGTH_BA = (1 << 3), // Run DELTA_LENGTH_BYTE_ARRAY encoding kernel DELTA_BYTE_ARRAY = (1 << 4), // Run DELTA_BYtE_ARRAY encoding kernel - BYTE_STREAM_SPLIT = (1 << 5), // Run plain encoding kernel, but split streams + BYTE_STREAM_SPLIT = (1 << 5) // Run plain encoding kernel, but split streams }; /** @@ -911,72 +914,18 @@ void 
DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[in] is_list Whether or not the data contains list data. + * @param[in] kernel_mask Mask indicating the type of decoding kernel to launch. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodePageDataFixed(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - -/** - * @brief Launches kernel for reading dictionary fixed width column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. - * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[in] is_list Whether or not the data contains list data. 
- * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - -/** - * @brief Launches kernel for reading fixed width column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. - * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[in] is_list Whether or not the data contains list data. 
- * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); +void DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + decode_kernel_mask kernel_mask, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder row group fragments diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 689386b8957..cfbb88cd80e 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -219,8 +219,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - // launch string decoder int s_idx = 0; + + auto decode_data = [&](decode_kernel_mask decoder_mask) { + DecodePageData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + decoder_mask, + error_code.data(), + streams[s_idx++]); + }; + + // launch string decoder if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { DecodeStringPageData(subpass.pages, pass.chunks, @@ -266,41 +278,17 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // launch byte stream split decoder if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - false, - error_code.data(), - streams[s_idx++]); + 
decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT); } // launch byte stream split decoder, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED); } // launch byte stream split decoder, for list columns if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); } // launch byte stream split decoder @@ -316,80 +304,47 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // launch fixed width type decoder if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT); } // launch fixed width type decoder for lists if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST); } // launch fixed width type decoder, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED); + } + + 
// launch boolean type decoder + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN) != 0) { + decode_data(decode_kernel_mask::BOOLEAN); + } + + // launch boolean type decoder, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN_NESTED) != 0) { + decode_data(decode_kernel_mask::BOOLEAN_NESTED); + } + + // launch boolean type decoder, for list columns + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN_LIST) != 0) { + decode_data(decode_kernel_mask::BOOLEAN_LIST); + } // launch fixed width type decoder with dictionaries if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT); } // launch fixed width type decoder with dictionaries for lists if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT_LIST); } // launch fixed width type decoder with dictionaries, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT_NESTED); } // launch the catch-all page decoder diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 69e783a89d0..3c49de0c997 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -152,7 +152,6 @@ __device__ inline void decode(level_t* const output, } // a single rle run.
may be broken up into multiple rle_batches -template struct rle_run { int size; // total size of the run int output_pos; // absolute position of this run w.r.t output @@ -183,14 +182,14 @@ struct rle_stream { level_t* output; - rle_run* runs; + rle_run* runs; int output_pos; int fill_index; int decode_index; - __device__ rle_stream(rle_run* _runs) : runs(_runs) {} + __device__ rle_stream(rle_run* _runs) : runs(_runs) {} __device__ inline bool is_last_decode_warp(int warp_id) { @@ -217,7 +216,7 @@ struct rle_stream { decode_index = -1; // signals the first iteration. Nothing to decode. } - __device__ inline int get_rle_run_info(rle_run& run) + __device__ inline int get_rle_run_info(rle_run& run) { run.start = cur; run.level_run = get_vlq32(run.start, end); @@ -384,7 +383,7 @@ struct rle_stream { // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for // the first time while (cur < end) { - rle_run run; + rle_run run; int run_bytes = get_rle_run_info(run); if ((output_pos + run.size) > target_count) { From 64c72fc022e5d5e2d687e6a93a3ab96fb6ef78c3 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:03:33 -0500 Subject: [PATCH 07/40] Move strings to date/time types benchmarks to nvbench (#17229) Moves the `cpp/benchmarks/string/convert_datetime.cpp` and `cpp/benchmarks/string/convert_duration.cpp` benchmark implementations from google-bench to nvbench. 
Authors: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17229 --- cpp/benchmarks/CMakeLists.txt | 13 +-- cpp/benchmarks/string/convert_datetime.cpp | 87 +++++++------- cpp/benchmarks/string/convert_durations.cpp | 122 ++++++++------------ 3 files changed, 91 insertions(+), 131 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index f6a5c97e059..ad090be99f3 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -355,15 +355,8 @@ ConfigureNVBench( # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- ConfigureBench( - STRINGS_BENCH - string/convert_datetime.cpp - string/convert_durations.cpp - string/factory.cu - string/filter.cpp - string/repeat_strings.cpp - string/replace.cpp - string/translate.cpp - string/url_decode.cu + STRINGS_BENCH string/factory.cu string/filter.cpp string/repeat_strings.cpp string/replace.cpp + string/translate.cpp string/url_decode.cu ) ConfigureNVBench( @@ -372,6 +365,8 @@ ConfigureNVBench( string/char_types.cpp string/combine.cpp string/contains.cpp + string/convert_datetime.cpp + string/convert_durations.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp string/copy.cpp diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp index 5deca3664b7..288aa6029d3 100644 --- a/cpp/benchmarks/string/convert_datetime.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -16,62 +16,59 @@ #include #include -#include -#include #include #include +#include #include -class StringDateTime : public cudf::benchmark {}; +#include -enum class direction { to, from }; 
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_D, "cudf::timestamp_D", "cudf::timestamp_D"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_s, "cudf::timestamp_s", "cudf::timestamp_s"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_us, "cudf::timestamp_us", "cudf::timestamp_us"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ns, "cudf::timestamp_ns", "cudf::timestamp_ns"); -template -void BM_convert_datetime(benchmark::State& state, direction dir) +using Types = nvbench::type_list; + +template +void bench_convert_datetime(nvbench::state& state, nvbench::type_list) { - auto const n_rows = static_cast(state.range(0)); - auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const from_ts = state.get_string("dir") == "from"; - auto const column = create_random_column(data_type.id(), row_count{n_rows}); - cudf::column_view input(column->view()); + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const ts_col = create_random_column(data_type.id(), row_count{num_rows}); - auto source = dir == direction::to ? cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S") - : make_empty_column(cudf::data_type{cudf::type_id::STRING}); - cudf::strings_column_view source_string(source->view()); + auto format = std::string{"%Y-%m-%d %H:%M:%S"}; + auto s_col = cudf::strings::from_timestamps(ts_col->view(), format); + auto sv = cudf::strings_column_view(s_col->view()); - for (auto _ : state) { - cuda_event_timer raii(state, true); - if (dir == direction::to) - cudf::strings::to_timestamps(source_string, data_type, "%Y-%m-%d %H:%M:%S"); - else - cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S"); - } + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - auto const bytes = dir == direction::to ? 
source_string.chars_size(cudf::get_default_stream()) - : n_rows * sizeof(TypeParam); - state.SetBytesProcessed(state.iterations() * bytes); + if (from_ts) { + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(sv.chars_size(stream)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::from_timestamps(ts_col->view(), format); + }); + } else { + state.add_global_memory_reads(sv.chars_size(stream)); + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::to_timestamps(sv, data_type, format); + }); + } } -#define STR_BENCHMARK_DEFINE(name, type, dir) \ - BENCHMARK_DEFINE_F(StringDateTime, name)(::benchmark::State & state) \ - { \ - BM_convert_datetime(state, dir); \ - } \ - BENCHMARK_REGISTER_F(StringDateTime, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -STR_BENCHMARK_DEFINE(from_days, cudf::timestamp_D, direction::from); -STR_BENCHMARK_DEFINE(from_seconds, cudf::timestamp_s, direction::from); -STR_BENCHMARK_DEFINE(from_mseconds, cudf::timestamp_ms, direction::from); -STR_BENCHMARK_DEFINE(from_useconds, cudf::timestamp_us, direction::from); -STR_BENCHMARK_DEFINE(from_nseconds, cudf::timestamp_ns, direction::from); - -STR_BENCHMARK_DEFINE(to_days, cudf::timestamp_D, direction::to); -STR_BENCHMARK_DEFINE(to_seconds, cudf::timestamp_s, direction::to); -STR_BENCHMARK_DEFINE(to_mseconds, cudf::timestamp_ms, direction::to); -STR_BENCHMARK_DEFINE(to_useconds, cudf::timestamp_us, direction::to); -STR_BENCHMARK_DEFINE(to_nseconds, cudf::timestamp_ns, direction::to); +NVBENCH_BENCH_TYPES(bench_convert_datetime, NVBENCH_TYPE_AXES(Types)) + .set_name("datetime") + .set_type_axes_names({"DataType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 16, 1 << 18, 1 << 20, 1 << 22}); diff --git a/cpp/benchmarks/string/convert_durations.cpp 
b/cpp/benchmarks/string/convert_durations.cpp index f12d292c2e7..9d2377f2d82 100644 --- a/cpp/benchmarks/string/convert_durations.cpp +++ b/cpp/benchmarks/string/convert_durations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,92 +14,60 @@ * limitations under the License. */ +#include #include -#include - -#include #include #include -#include +#include #include -#include -#include - -class DurationsToString : public cudf::benchmark {}; -template -void BM_convert_from_durations(benchmark::State& state) -{ - cudf::size_type const source_size = state.range(0); - - // Every element is valid - auto data = cudf::detail::make_counting_transform_iterator( - 0, [source_size](auto i) { return TypeParam{i - source_size / 2}; }); +#include - cudf::test::fixed_width_column_wrapper source_durations(data, data + source_size); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_D, "cudf::duration_D", "cudf::duration_D"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_s, "cudf::duration_s", "cudf::duration_s"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_ms, "cudf::duration_ms", "cudf::duration_ms"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_us, "cudf::duration_us", "cudf::duration_us"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_ns, "cudf::duration_ns", "cudf::duration_ns"); - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::strings::from_durations(source_durations, "%D days %H:%M:%S"); - } - - state.SetBytesProcessed(state.iterations() * source_size * sizeof(TypeParam)); -} +using Types = nvbench::type_list; -class StringToDurations : public cudf::benchmark {}; -template -void BM_convert_to_durations(benchmark::State& state) +template +void bench_convert_duration(nvbench::state& state, nvbench::type_list) { - 
cudf::size_type const source_size = state.range(0); - - // Every element is valid - auto data = cudf::detail::make_counting_transform_iterator( - 0, [source_size](auto i) { return TypeParam{i - source_size / 2}; }); - - cudf::test::fixed_width_column_wrapper source_durations(data, data + source_size); - auto results = cudf::strings::from_durations(source_durations, "%D days %H:%M:%S"); - cudf::strings_column_view source_string(*results); - auto output_type = cudf::data_type(cudf::type_to_id()); - - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::strings::to_durations(source_string, output_type, "%D days %H:%M:%S"); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const from_dur = state.get_string("dir") == "from"; + + auto const ts_col = create_random_column(data_type.id(), row_count{num_rows}); + cudf::column_view input(ts_col->view()); + + auto format = std::string{"%D days %H:%M:%S"}; + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (from_dur) { + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(format.size() * num_rows); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::from_durations(input, format); }); + } else { + auto source = cudf::strings::from_durations(input, format); + auto view = cudf::strings_column_view(source->view()); + state.add_global_memory_reads(view.chars_size(stream)); + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::to_durations(view, data_type, format); + }); } - - state.SetBytesProcessed(state.iterations() * source_size * sizeof(TypeParam)); } -#define DSBM_BENCHMARK_DEFINE(name, type) \ - BENCHMARK_DEFINE_F(DurationsToString, name)(::benchmark::State & state) \ - { \ - 
BM_convert_from_durations(state); \ - } \ - BENCHMARK_REGISTER_F(DurationsToString, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -#define SDBM_BENCHMARK_DEFINE(name, type) \ - BENCHMARK_DEFINE_F(StringToDurations, name)(::benchmark::State & state) \ - { \ - BM_convert_to_durations(state); \ - } \ - BENCHMARK_REGISTER_F(StringToDurations, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -DSBM_BENCHMARK_DEFINE(from_durations_D, cudf::duration_D); -DSBM_BENCHMARK_DEFINE(from_durations_s, cudf::duration_s); -DSBM_BENCHMARK_DEFINE(from_durations_ms, cudf::duration_ms); -DSBM_BENCHMARK_DEFINE(from_durations_us, cudf::duration_us); -DSBM_BENCHMARK_DEFINE(from_durations_ns, cudf::duration_ns); - -SDBM_BENCHMARK_DEFINE(to_durations_D, cudf::duration_D); -SDBM_BENCHMARK_DEFINE(to_durations_s, cudf::duration_s); -SDBM_BENCHMARK_DEFINE(to_durations_ms, cudf::duration_ms); -SDBM_BENCHMARK_DEFINE(to_durations_us, cudf::duration_us); -SDBM_BENCHMARK_DEFINE(to_durations_ns, cudf::duration_ns); +NVBENCH_BENCH_TYPES(bench_convert_duration, NVBENCH_TYPE_AXES(Types)) + .set_name("duration") + .set_type_axes_names({"DataType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 10, 1 << 15, 1 << 20, 1 << 25}); From 773aefc3d63aa354f64e4b60794297e8ef64fcba Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:24:41 -0500 Subject: [PATCH 08/40] Use `pylibcudf.strings.convert.convert_integers.is_integer` in cudf python (#17270) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17270 --- .../_lib/strings/convert/convert_integers.pyx | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git 
a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx index 8b6da2bfa1c..50113347ccb 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx @@ -1,15 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - is_integer as cpp_is_integer, -) +import pylibcudf as plc from cudf._lib.column cimport Column @@ -20,12 +13,8 @@ def is_integer(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have integers. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_integer( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_integers.is_integer( + source_strings.to_pylibcudf(mode="read") + ) + ) From c73defdf704d067c86ee5fce6c43b4f707d382b2 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:20:08 -0500 Subject: [PATCH 09/40] Use pylibcudf.search APIs in cudf python (#17271) Part of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17271 --- python/cudf/cudf/_lib/sort.pyx | 58 +++++++--------------------------- 1 file changed, 11 insertions(+), 47 deletions(-) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 185552ede82..eefe37d9880 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -5,21
+5,10 @@ from itertools import repeat from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector from pylibcudf.libcudf.aggregation cimport rank_method -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.search cimport lower_bound, upper_bound -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport null_order, order as cpp_order - from cudf._lib.column cimport Column -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table import pylibcudf @@ -311,44 +300,19 @@ def digitize(list source_columns, list bins, bool right=False): right : Indicating whether the intervals include the right or the left bin edge. """ - - cdef table_view bins_view = table_view_from_columns(bins) - cdef table_view source_table_view = table_view_from_columns( - source_columns - ) - cdef vector[cpp_order] column_order = ( - vector[cpp_order]( - bins_view.num_columns(), - cpp_order.ASCENDING - ) - ) - cdef vector[null_order] null_precedence = ( - vector[null_order]( - bins_view.num_columns(), - null_order.BEFORE + return Column.from_pylibcudf( + getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")( + pylibcudf.Table( + [c.to_pylibcudf(mode="read") for c in bins] + ), + pylibcudf.Table( + [c.to_pylibcudf(mode="read") for c in source_columns] + ), + [pylibcudf.types.Order.ASCENDING]*len(bins), + [pylibcudf.types.NullOrder.BEFORE]*len(bins) ) ) - cdef unique_ptr[column] c_result - if right: - with nogil: - c_result = move(lower_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - else: - with nogil: - c_result = move(upper_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() 
def rank_columns(list source_columns, rank_method method, str na_option, From e52ce858ce216c7ee2e02f5256418fdae955b2a4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:42:27 -0800 Subject: [PATCH 10/40] Mark column chunks in a PQ reader `pass` as large strings when the cumulative `offsets` exceeds the large strings threshold. (#17207) This PR implements a method to correctly set the large-string property for column chunks in the Chunked Parquet Reader subpass if the cumulative string offsets have exceeded the large strings threshold. Fixes #17158 Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17207 --- cpp/src/io/parquet/reader_impl.cpp | 31 ++++++-- cpp/src/io/parquet/reader_impl_chunking.hpp | 3 + cpp/src/io/utilities/column_buffer.cpp | 4 +- cpp/src/io/utilities/column_buffer.hpp | 6 +- cpp/src/io/utilities/column_buffer_strings.cu | 3 +- cpp/tests/large_strings/parquet_tests.cpp | 74 +++++++++++++++++++ 6 files changed, 111 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index cfbb88cd80e..d74ae83b635 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -97,22 +97,37 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } + // Compute column string sizes (using page string offsets) for this subpass col_string_sizes = calculate_page_string_offsets(); - // check for overflow + // ensure cumulative column string sizes have been initialized + if (pass.cumulative_col_string_sizes.empty()) { + pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); + } + + // Add to the cumulative column string sizes of this pass +
std::transform(pass.cumulative_col_string_sizes.begin(), + pass.cumulative_col_string_sizes.end(), + col_string_sizes.begin(), + pass.cumulative_col_string_sizes.begin(), + std::plus<>{}); + + // Check for overflow in cumulative column string sizes of this pass so that the page string + // offsets of overflowing (large) string columns are treated as 64-bit. auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), - col_string_sizes.cend(), + auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), + pass.cumulative_col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // mark any chunks that are large string columns + // Mark any chunks for which the cumulative column string size has exceeded the + // large strings threshold if (has_large_strings) { for (auto& chunk : pass.chunks) { auto const idx = chunk.src_col_index; - if (col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } + if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } } } } @@ -195,7 +210,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], _stream); + out_buf.create_string_data( + col_string_sizes[pass.chunks[c].src_col_index], + pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > + static_cast(strings::detail::get_offset64_threshold()), + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= diff --git 
a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index a0c2dbd3e44..ca46f198bb8 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -130,6 +130,9 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; + // cumulative strings column sizes. + std::vector cumulative_col_string_sizes{}; + int level_type_size{0}; // skip_rows / num_rows for this pass. diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 6d954753af8..41ed55cd090 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -63,9 +63,11 @@ void cudf::io::detail::inline_column_buffer::allocate_strings_data(bool memset_d } void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + bool is_large_strings_col, rmm::cuda_stream_view stream) { - _string_data = rmm::device_buffer(num_bytes, stream, _mr); + _is_large_strings_col = is_large_strings_col; + _string_data = rmm::device_buffer(num_bytes, stream, _mr); } namespace { diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 31c8b781e77..da19539f509 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -246,13 +246,17 @@ class inline_column_buffer : public column_buffer_base { [[nodiscard]] size_t data_size_impl() const { return _data.size(); } std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); - void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); + void create_string_data(size_t num_bytes, + bool is_large_strings_col, + rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } [[nodiscard]] void const* string_data() const { return _string_data.data(); } [[nodiscard]] size_t 
string_size() const { return _string_data.size(); } + [[nodiscard]] bool is_large_strings_column() const { return _is_large_strings_col; } private: rmm::device_buffer _string_data{}; + bool _is_large_strings_col{}; }; using column_buffer = gather_column_buffer; diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu index 4bc303a34a5..66d0a644c12 100644 --- a/cpp/src/io/utilities/column_buffer_strings.cu +++ b/cpp/src/io/utilities/column_buffer_strings.cu @@ -27,8 +27,7 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu { // if the size of _string_data is over the threshold for 64bit size_type, _data will contain // sizes rather than offsets. need special handling for that case. - auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - if (_string_data.size() > threshold) { + if (is_large_strings_column()) { if (not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp index f47782a2d02..39cd783de00 100644 --- a/cpp/tests/large_strings/parquet_tests.cpp +++ b/cpp/tests/large_strings/parquet_tests.cpp @@ -18,6 +18,7 @@ #include +#include #include #include #include @@ -69,3 +70,76 @@ TEST_F(ParquetStringsTest, ReadLargeStrings) // go back to normal threshold unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); } + +// Disabled as the test is too brittle and depends on empirically set `pass_read_limit`, +// encoding type, and the currently used `ZSTD` scratch space size. 
+TEST_F(ParquetStringsTest, DISABLED_ChunkedReadLargeStrings) +{ + // Construct a table with one large strings column > 2GB + auto const wide = this->wide_column(); + auto input = cudf::concatenate(std::vector(120000, wide)); ///< 230MB + + int constexpr multiplier = 12; + std::vector input_cols(multiplier, input->view()); + auto col0 = cudf::concatenate(input_cols); ///< 2.70GB + + // Expected table + auto const expected = cudf::table_view{{col0->view()}}; + auto expected_metadata = cudf::io::table_input_metadata{expected}; + + // Needed to get exactly 2 Parquet subpasses: first with large-strings and the second with + // regular ones. This may change in the future and lead to false failures. + expected_metadata.column_metadata[0].set_encoding( + cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY); + + // Host buffer to write Parquet + std::vector buffer; + + // Writer options + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected) + .metadata(expected_metadata); + + // Needed to get exactly 2 Parquet subpasses: first with large-strings and the second with + // regular ones. This may change in the future and lead to false failures. + out_opts.set_compression(cudf::io::compression_type::ZSTD); + + // Write to Parquet + cudf::io::write_parquet(out_opts); + + // Empirically set pass_read_limit of 8GB so we read almost entire table (>2GB strings) in the + // first subpass and only a small amount in the second subpass. This may change in the future + // and lead to false failures.
+ size_t constexpr pass_read_limit = size_t{8} * 1024 * 1024 * 1024; + + // Reader options + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size())); + + // Chunked parquet reader + auto reader = cudf::io::chunked_parquet_reader(0, pass_read_limit, default_in_opts); + + // Read chunked + auto tables = std::vector>{}; + while (reader.has_next()) { + tables.emplace_back(reader.read_chunk().tbl); + } + auto table_views = std::vector{}; + std::transform(tables.begin(), tables.end(), std::back_inserter(table_views), [](auto& tbl) { + return tbl->view(); + }); + auto result = cudf::concatenate(table_views); + auto const result_view = result->view(); + + // Verify offsets + for (auto const& cv : result_view) { + auto const offsets = cudf::strings_column_view(cv).offsets(); + EXPECT_EQ(offsets.type(), cudf::data_type{cudf::type_id::INT64}); + } + + // Verify tables to be equal + CUDF_TEST_EXPECT_TABLES_EQUAL(result_view, expected); + + // Verify that we read exactly two table chunks + EXPECT_EQ(tables.size(), 2); +} From b3b5ce94a576bd19967e41ef6c82ff94342e7b80 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 7 Nov 2024 18:22:10 -0600 Subject: [PATCH 11/40] Add optional column_order in JSON reader (#17029) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds optional column order to enforce column order in the output. This feature is required by spark from_json. Optional `column_order` is added to `schema_element`, and it is validated during reader_option creation. The column order can be specified at root level and for any struct in any level. • For root level, the dtypes should be schema_element with type STRUCT. (schema_element is added to variant dtypes) • For nested level, column_order can be specified for any STRUCT type. 
(could be a map of schema_element , or schema_element) If the column order is not specified, the order of columns is same as the order of columns that appear in json file. Closes #17240 (metadata updated) Closes #17091 (will return all nulls column if not present in input json) Closes #17090 (fixed with new schema_element as dtype) Closes #16799 (output columns are created from column_order if present) Authors: - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17029 --- cpp/include/cudf/io/json.hpp | 53 ++++- cpp/src/io/json/host_tree_algorithms.cu | 28 ++- cpp/src/io/json/json_column.cu | 114 +++++++--- cpp/src/io/json/nested_json.hpp | 23 ++ cpp/src/io/json/nested_json_gpu.cu | 4 +- cpp/src/io/json/parser_features.cpp | 192 ++++++++++++++++ cpp/tests/io/json/json_test.cpp | 283 +++++++++++++++++++++++- 7 files changed, 637 insertions(+), 60 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index b662b660557..7cd4697f592 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -18,6 +18,7 @@ #include "types.hpp" +#include #include #include #include @@ -53,6 +54,11 @@ struct schema_element { * @brief Allows specifying this column's child columns target type */ std::map child_types; + + /** + * @brief Allows specifying the order of the columns + */ + std::optional> column_order; }; /** @@ -87,13 +93,18 @@ enum class json_recovery_mode_t { * | `chunksize` | use `byte_range_xxx` for chunking instead | */ class json_reader_options { + public: + using dtype_variant = + std::variant, + std::map, + std::map, + schema_element>; ///< Variant type holding dtypes information for the columns + + private: source_info _source; // Data types of the column; empty to infer dtypes - 
std::variant, - std::map, - std::map> - _dtypes; + dtype_variant _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = compression_type::AUTO; @@ -178,13 +189,7 @@ class json_reader_options { * * @returns Data types of the columns */ - [[nodiscard]] std::variant, - std::map, - std::map> const& - get_dtypes() const - { - return _dtypes; - } + [[nodiscard]] dtype_variant const& get_dtypes() const { return _dtypes; } /** * @brief Returns compression format of the source. @@ -228,7 +233,11 @@ class json_reader_options { */ [[nodiscard]] size_t get_byte_range_padding() const { - auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = + std::visit(cudf::detail::visitor_overload{ + [](auto const& dtypes) { return dtypes.size(); }, + [](schema_element const& dtypes) { return dtypes.child_types.size(); }}, + _dtypes); auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; @@ -390,6 +399,14 @@ class json_reader_options { */ void set_dtypes(std::map types) { _dtypes = std::move(types); } + /** + * @brief Set data types for a potentially nested column hierarchy. + * + * @param types schema element with column names and column order to support arbitrary nesting of + * data types + */ + void set_dtypes(schema_element types); + /** * @brief Set the compression type. * @@ -624,6 +641,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set data types for columns to be read. + * + * @param types Struct schema_element with Column name -> schema_element with map and order + * @return this for chaining + */ + json_reader_options_builder& dtypes(schema_element types) + { + options.set_dtypes(std::move(types)); + return *this; + } + /** * @brief Set the compression type. 
* diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 570a00cbfc2..7fafa885c66 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -269,7 +269,8 @@ std::map unified_schema(cudf::io::json_reader_optio }); return dnew; }, - [](std::map const& user_dtypes) { return user_dtypes; }}, + [](std::map const& user_dtypes) { return user_dtypes; }, + [](schema_element const& user_dtypes) { return user_dtypes.child_types; }}, options.get_dtypes()); } @@ -492,7 +493,7 @@ std::pair, hashmap_of_device_columns> build_tree auto expected_types = cudf::detail::make_host_vector(num_columns, stream); std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); - auto lookup_names = [&column_names](auto child_ids, auto name) { + auto lookup_names = [&column_names](auto const& child_ids, auto const& name) { for (auto const& child_id : child_ids) { if (column_names[child_id] == name) return child_id; } @@ -569,7 +570,7 @@ std::pair, hashmap_of_device_columns> build_tree for (size_t i = 0; i < adj[root_list_col_id].size() && i < user_dtypes.size(); i++) { NodeIndexT const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; auto value_id = std::stol(name); if (value_id >= 0 and value_id < static_cast(user_dtypes.size())) mark_is_pruned(first_child_id, schema_element{user_dtypes[value_id]}); @@ -580,7 +581,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if (user_dtypes.count(name)) mark_is_pruned(first_child_id, schema_element{user_dtypes.at(name)}); } @@ -589,10 +590,19 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> 
void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if (user_dtypes.count(name)) mark_is_pruned(first_child_id, user_dtypes.at(name)); } + }, + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + schema_element const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { + auto const first_child_id = adj[root_list_col_id][i]; + auto const& name = column_names[first_child_id]; + if (user_dtypes.child_types.count(name) != 0) + mark_is_pruned(first_child_id, user_dtypes.child_types.at(name)); + } }}, options.get_dtypes()); } else { @@ -626,7 +636,9 @@ std::pair, hashmap_of_device_columns> build_tree [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema]( std::map const& user_dtypes) -> void { mark_is_pruned(root_struct_col_id, u_schema); - }}, + }, + [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema](schema_element const& user_dtypes) + -> void { mark_is_pruned(root_struct_col_id, u_schema); }}, options.get_dtypes()); } // Useful for array of arrays @@ -714,7 +726,7 @@ std::pair, hashmap_of_device_columns> build_tree if (expected_category == NC_STRUCT) { // find field column ids, and its children and create columns. 
for (auto const& field_id : child_ids) { - auto name = column_names[field_id]; + auto const& name = column_names[field_id]; if (is_pruned[field_id]) continue; auto inserted = ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second; @@ -745,7 +757,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map> array_values; for (auto const& child_id : child_ids) { if (is_pruned[child_id]) continue; - auto name = column_names[child_id]; + auto const& name = column_names[child_id]; array_values[std::stoi(name)].push_back(child_id); } // diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 7e4d975e431..30a154fdda2 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -399,9 +399,9 @@ std::pair, std::vector> device_json_co // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema return {std::move(col), std::vector{}}; @@ -410,12 +410,37 @@ std::pair, std::vector> device_json_co std::vector> child_columns; std::vector column_names{}; size_type num_rows{json_col.num_rows}; + + bool const has_column_order = + prune_columns and not schema.value_or(schema_element{}) + .column_order.value_or(std::vector{}) + .empty(); + + auto const& col_order = + has_column_order ? 
schema.value().column_order.value() : json_col.column_order; + // Create children columns - for (auto const& col_name : json_col.column_order) { - auto const& col = json_col.child_columns.find(col_name); - column_names.emplace_back(col->first); - auto& child_col = col->second; + for (auto const& col_name : col_order) { auto child_schema_element = get_child_schema(col_name); + auto const found_it = json_col.child_columns.find(col_name); + + if (prune_columns and found_it == std::end(json_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + column_names.emplace_back(make_column_name_info( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), col_name)); + auto all_null_column = make_all_nulls_column( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), + num_rows, + stream, + mr); + child_columns.emplace_back(std::move(all_null_column)); + continue; + } + column_names.emplace_back(found_it->first); + + auto& child_col = found_it->second; if (!prune_columns or child_schema_element.has_value()) { auto [child_column, names] = device_json_column_to_cudf_column( child_col, d_input, options, prune_columns, child_schema_element, stream, mr); @@ -576,11 +601,21 @@ table_with_metadata device_parse_nested_json(device_span d_input, std::vector out_column_names; auto parse_opt = parsing_options(options, stream); - // Iterate over the struct's child columns and convert to cudf column - size_type column_index = 0; - for (auto const& col_name : root_struct_col.column_order) { - auto& json_col = root_struct_col.child_columns.find(col_name)->second; + schema_element const* prune_schema = std::get_if(&options.get_dtypes()); + bool const has_column_order = options.is_enabled_prune_columns() and prune_schema != nullptr and + prune_schema->column_order.has_value() and + not prune_schema->column_order->empty(); + 
auto const& col_order = + has_column_order ? prune_schema->column_order.value() : root_struct_col.column_order; + if (has_column_order) { + CUDF_EXPECTS(prune_schema->child_types.size() == col_order.size(), + "Input schema column order size mismatch with input schema child types"); + } + auto root_col_size = root_struct_col.num_rows; + // Iterate over the struct's child columns/column_order and convert to cudf column + size_type column_index = 0; + for (auto const& col_name : col_order) { std::optional child_schema_element = std::visit( cudf::detail::visitor_overload{ [column_index](std::vector const& user_dtypes) -> std::optional { @@ -590,17 +625,23 @@ table_with_metadata device_parse_nested_json(device_span d_input, }, [col_name]( std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) + return std::optional{{it->second}}; + return std::nullopt; }, [col_name](std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) return it->second; + return std::nullopt; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + if (auto it = user_dtypes.child_types.find(col_name); + it != std::end(user_dtypes.child_types)) + return it->second; + return std::nullopt; }}, options.get_dtypes()); + #ifdef NJP_DEBUG_PRINT auto debug_schema_print = [](auto ret) { std::cout << ", type id: " @@ -608,20 +649,39 @@ table_with_metadata device_parse_nested_json(device_span d_input, << ", with " << (ret.has_value() ? 
ret->child_types.size() : 0) << " children" << "\n"; }; - std::visit( - cudf::detail::visitor_overload{[column_index](std::vector const&) { - std::cout << "Column by index: #" << column_index; - }, - [col_name](std::map const&) { - std::cout << "Column by flat name: '" << col_name; - }, - [col_name](std::map const&) { - std::cout << "Column by nested name: #" << col_name; - }}, - options.get_dtypes()); + std::visit(cudf::detail::visitor_overload{ + [column_index](std::vector const&) { + std::cout << "Column by index: #" << column_index; + }, + [col_name](std::map const&) { + std::cout << "Column by flat name: '" << col_name; + }, + [col_name](std::map const&) { + std::cout << "Column by nested name: #" << col_name; + }, + [col_name](schema_element const&) { + std::cout << "Column by nested schema with column order: #" << col_name; + }}, + options.get_dtypes()); debug_schema_print(child_schema_element); #endif + auto const found_it = root_struct_col.child_columns.find(col_name); + if (options.is_enabled_prune_columns() and + found_it == std::end(root_struct_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + // inserts all null column + out_column_names.emplace_back(make_column_name_info(child_schema_element.value(), col_name)); + auto all_null_column = + make_all_nulls_column(child_schema_element.value(), root_col_size, stream, mr); + out_columns.emplace_back(std::move(all_null_column)); + column_index++; + continue; + } + auto& json_col = found_it->second; + if (!options.is_enabled_prune_columns() or child_schema_element.has_value()) { // Get this JSON column's cudf column and schema info, (modifies json_col) auto [cudf_col, col_name_info] = diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 7b3b04dea16..4989fff4b30 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ 
-429,6 +429,29 @@ table_with_metadata device_parse_nested_json(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @brief Create all null column of a given nested schema + * + * @param schema The schema of the column to create + * @param num_rows The number of rows in the column + * @param stream The CUDA stream to which kernels are dispatched + * @param mr resource with which to allocate + * @return The all null column + */ +std::unique_ptr make_all_nulls_column(schema_element const& schema, + size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +/** + * @brief Create metadata for a column of a given schema + * + * @param schema The schema of the column + * @param col_name The name of the column + * @return column metadata for a given schema + */ +column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name); + /** * @brief Get the path data type of a column by path if present in input schema * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 60e78f4763d..f1c2826c62a 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2198,9 +2198,9 @@ std::pair, std::vector> json_column_to // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema else { diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 4caa5cd9e24..401a6e992de 100644 --- a/cpp/src/io/json/parser_features.cpp +++ 
b/cpp/src/io/json/parser_features.cpp @@ -16,14 +16,201 @@ #include "nested_json.hpp" +#include +#include +#include #include +#include +#include +#include +#include +#include #include #include #include +namespace cudf::io { +namespace { +bool validate_column_order(schema_element const& types) +{ + // For struct types, check if column_order size matches child_types size and all elements in + // column_order are in child_types, in child_types, call this function recursively. + // For list types, check if child_types size is 1 and call this function recursively. + if (types.type.id() == type_id::STRUCT) { + if (types.column_order.has_value()) { + if (types.column_order.value().size() != types.child_types.size()) { return false; } + for (auto const& column_name : types.column_order.value()) { + auto it = types.child_types.find(column_name); + if (it == types.child_types.end()) { return false; } + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + } + } else if (types.type.id() == type_id::LIST) { + if (types.child_types.size() != 1) { return false; } + auto it = types.child_types.begin(); + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + return true; +} +} // namespace + +void json_reader_options::set_dtypes(schema_element types) +{ + CUDF_EXPECTS( + validate_column_order(types), "Column order does not match child types", std::invalid_argument); + _dtypes = std::move(types); +} +} // namespace cudf::io + namespace cudf::io::json::detail { +/// Created an empty column of the specified schema +struct empty_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + template ())> + std::unique_ptr operator()(schema_element const& schema) const + { + return make_empty_column(schema.type); + } + + template )> + std::unique_ptr 
operator()(schema_element const& schema) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name)); + auto offsets = make_empty_column(data_type(type_to_id())); + return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema) const + { + std::vector> child_columns; + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + child_columns.push_back(cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name))); + } + return make_structs_column(0, std::move(child_columns), 0, {}, stream, mr); + } +}; + +/// Created all null column of the specified schema +struct allnull_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + private: + auto make_zeroed_offsets(size_type size) const + { + auto offsets_buff = + cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); + return std::make_unique(std::move(offsets_buff), rmm::device_buffer{}, 0); + } + + public: + template ())> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr); + } + + template ())> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, + empty_column_functor{stream, mr}, + schema.child_types.at(child_name)); + return make_fixed_width_column(schema.type, size, 
mask_state::ALL_NULL, stream, mr); + auto indices = make_zeroed_offsets(size - 1); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_dictionary_column( + std::move(child), std::move(indices), std::move(null_mask), size, stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + auto offsets = make_zeroed_offsets(size); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_strings_column( + size, std::move(offsets), rmm::device_buffer{}, size, std::move(null_mask)); + } + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, + empty_column_functor{stream, mr}, + schema.child_types.at(child_name)); + auto offsets = make_zeroed_offsets(size); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_lists_column( + size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + std::vector> child_columns; + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + child_columns.push_back(cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size)); + } + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_structs_column( + size, std::move(child_columns), size, std::move(null_mask), stream, mr); + } +}; + +std::unique_ptr make_all_nulls_column(schema_element const& schema, + size_type num_rows, + rmm::cuda_stream_view stream, + 
rmm::device_async_resource_ref mr) +{ + return cudf::type_dispatcher(schema.type, allnull_column_functor{stream, mr}, schema, num_rows); +} + +column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name) +{ + column_name_info info; + info.name = col_name; + switch (schema.type.id()) { + case type_id::STRUCT: + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + info.children.push_back( + make_column_name_info(schema.child_types.at(child_name), child_name)); + } + break; + case type_id::LIST: + info.children.emplace_back("offsets"); + for (auto const& [child_name, child_schema] : schema.child_types) { + info.children.push_back(make_column_name_info(child_schema, child_name)); + } + break; + case type_id::DICTIONARY32: + info.children.emplace_back("indices"); + for (auto const& [child_name, child_schema] : schema.child_types) { + info.children.push_back(make_column_name_info(child_schema, child_name)); + } + break; + case type_id::STRING: info.children.emplace_back("offsets"); break; + default: break; + } + return info; +} + std::optional child_schema_element(std::string const& col_name, cudf::io::json_reader_options const& options) { @@ -46,6 +233,11 @@ std::optional child_schema_element(std::string const& col_name, return (user_dtypes.find(col_name) != std::end(user_dtypes)) ? user_dtypes.find(col_name)->second : std::optional{}; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) + ? 
user_dtypes.child_types.find(col_name)->second + : std::optional{}; }}, options.get_dtypes()); } diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index b58ca56e066..199b0092473 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -239,7 +239,7 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest(), scale}}) + .dtypes(std::vector{data_type{type_to_id(), scale}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -324,7 +324,7 @@ TEST_P(JsonReaderParamTest, FloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -348,7 +348,8 @@ TEST_P(JsonReaderParamTest, JsonLinesStrings) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) - .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) + .dtypes(std::map{ + {"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -466,7 +467,7 @@ TEST_P(JsonReaderParamTest, Booleans) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -508,7 +509,7 @@ TEST_P(JsonReaderParamTest, Dates) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) + .dtypes(std::vector{data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) .dayfirst(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -564,7 +565,7 @@ TEST_P(JsonReaderParamTest, 
Durations) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) + .dtypes(std::vector{data_type{type_id::DURATION_NANOSECONDS}}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -1022,7 +1023,7 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -1461,7 +1462,7 @@ TEST_F(JsonReaderTest, ErrorStrings) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{cudf::type_id::STRING}}) + .dtypes(std::vector{data_type{cudf::type_id::STRING}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -1849,7 +1850,7 @@ TYPED_TEST(JsonFixedPointReaderTest, EmptyValues) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{type_to_id(), 0}}) + .dtypes(std::vector{data_type{type_to_id(), 0}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -2827,7 +2828,7 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2865,7 +2866,7 @@ 
TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2991,4 +2992,264 @@ TEST_F(JsonReaderTest, LastRecordInvalid) CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), cudf::table_view{{expected}}); } +// Test case for dtype pruning with column order +TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + for (auto& [json_string, lines] : {std::pair{json_stringl, true}, {json_string, false}}) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .prune_columns(true) + .lines(lines); + + // include all columns + //// schema with partial ordering + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"0", "1"}}}}, + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"b", "a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and 
"c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + EXPECT_EQ(result.metadata.schema_info[1].name, "a"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // "b" children checks + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[1].name, "element"); + // types + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::BOOL8); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(1).type().id(), cudf::type_id::FLOAT32); + } + //// schema with pruned columns and different order. 
+ { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"c", {dtype()}}, + {"b", + { + data_type{cudf::type_id::STRUCT}, + }}, + {"a", {dtype()}}, + }, + {{"c", "b", "a"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // "c", "b" and "a" order + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "c"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "a"); + // pruned + EXPECT_EQ(result.metadata.schema_info[1].children.size(), 0); + } + //// schema with pruned columns and different sub-order. + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"c", {dtype()}}, + {"b", + {data_type{cudf::type_id::STRUCT}, + // {}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"1", "0"}}}}, + {"a", {dtype()}}, + }}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Order of occurance in json + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // Sub-order of "b" + EXPECT_EQ(result.metadata.schema_info[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "1"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "0"); + } + //// schema with 1 dtype, but 2 column order + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + }, + {{"a", "b"}}}; + EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument); + // Input schema column order size mismatch with input 
schema child types + } + //// repetition, Error + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + }, + {{"a", "a"}}}; + EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument); + // Input schema column order size mismatch with input schema child types + } + //// different column name in order, Error + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + }, + {{"b"}}}; + EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument); + // Column name not found in input schema map, but present in column order and + // prune_columns is enabled + } + // include only one column (nested) + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"1"}}}}, + }}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "b":"1":[float] + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[1].name, "element"); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(1).type().id(), cudf::type_id::FLOAT32); + } + // multiple - all present 
+ { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", and "c" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "c"); + } + // multiple - not all present + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"d", {dtype()}}, + }, + {{"a", "d"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "d"); + auto all_null_bools = + cudf::test::fixed_width_column_wrapper{{true, true, true, true}, {0, 0, 0, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_bools); + } + // test struct, list of string, list of struct. + // multiple - not all present nested + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + { + {"2", {data_type{cudf::type_id::STRING}}}, + }, + {{"2"}}}}, + {"d", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}, + {"e", + {data_type{cudf::type_id::LIST}, + {{"element", + { + data_type{cudf::type_id::STRUCT}, + { + {"3", {data_type{cudf::type_id::STRING}}}, + }, //{{"3"}} missing column_order, but output should not have it. 
+ }}}}}, + }, + {{"b", "d", "e"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "b" (empty struct) and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + ASSERT_EQ(result.metadata.schema_info[0].children[0].name, "2"); + EXPECT_EQ(result.metadata.schema_info[1].name, "d"); + auto all_null_strings = cudf::test::strings_column_wrapper{{"", "", "", ""}, {0, 0, 0, 0}}; + EXPECT_EQ(result.tbl->get_column(0).num_children(), 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(0), all_null_strings); + auto const all_null_list = cudf::test::lists_column_wrapper{ + {{0, 0}, {1, 1}, {2, 2}, {3, 3}}, cudf::test::iterators::all_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_list); + EXPECT_EQ(result.metadata.schema_info[2].name, "e"); + ASSERT_EQ(result.metadata.schema_info[2].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[2].children[1].children.size(), 0); + // ASSERT_EQ(result.metadata.schema_info[2].children[1].children[0].name, "3"); + auto empty_string_col = cudf::test::strings_column_wrapper{}; + cudf::test::structs_column_wrapper expected_structs{{}, cudf::test::iterators::all_nulls()}; + // make all null column of list of struct of string + auto wrapped = make_lists_column( + 4, + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}.release(), + expected_structs.release(), + 4, + cudf::create_null_mask(4, cudf::mask_state::ALL_NULL)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped); + } + } +} + CUDF_TEST_PROGRAM_MAIN() From 1777c29840b0d8fce1799cee249fb5d44e7ddf74 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 7 Nov 2024 19:34:38 -0500 Subject: [PATCH 12/40] Allow 
generating large strings in benchmarks (#17224) Updates the benchmark utility `create_random_utf8_string_column` to support large strings. Replaces the hardcoded `size_type` offsets with the offsetalator and related utilities. Reference #16948 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17224 --- cpp/benchmarks/common/generate_input.cu | 37 +++++++++++++------------ 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index bdce8a31176..8bce718c7d8 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -23,11 +23,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -540,7 +542,7 @@ struct string_generator { // range 32-127 is ASCII; 127-136 will be multi-byte UTF-8 { } - __device__ void operator()(thrust::tuple str_begin_end) + __device__ void operator()(thrust::tuple str_begin_end) { auto begin = thrust::get<0>(str_begin_end); auto end = thrust::get<1>(str_begin_end); @@ -569,6 +571,9 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons distribution_params{1. 
- profile.get_null_probability().value_or(0)}); auto lengths = len_dist(engine, num_rows + 1); auto null_mask = valid_dist(engine, num_rows + 1); + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + thrust::transform_if( thrust::device, lengths.begin(), @@ -580,28 +585,26 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons auto valid_lengths = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())), valid_or_zero{}); - rmm::device_uvector offsets(num_rows + 1, cudf::get_default_stream()); - thrust::exclusive_scan( - thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin()); - // offsets are ready. - auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1); + + // offsets are created as INT32 or INT64 as appropriate + auto [offsets, chars_length] = cudf::strings::detail::make_offsets_child_column( + valid_lengths, valid_lengths + num_rows, stream, mr); + // use the offsetalator to normalize the offset values for use by the string_generator + auto offsets_itr = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); rmm::device_uvector chars(chars_length, cudf::get_default_stream()); thrust::for_each_n(thrust::device, - thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1), + thrust::make_zip_iterator(offsets_itr, offsets_itr + 1), num_rows, string_generator{chars.data(), engine}); + auto [result_bitmask, null_count] = - cudf::detail::valid_if(null_mask.begin(), - null_mask.end() - 1, - thrust::identity{}, - cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + profile.get_null_probability().has_value() + ? 
cudf::detail::valid_if( + null_mask.begin(), null_mask.end() - 1, thrust::identity{}, stream, mr) + : std::pair{rmm::device_buffer{}, 0}; + return cudf::make_strings_column( - num_rows, - std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - chars.release(), - null_count, - profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); + num_rows, std::move(offsets), chars.release(), null_count, std::move(result_bitmask)); } /** From 3c5f787725d0de3b10d5eb1e9fef1fa9b07bc67b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 7 Nov 2024 19:35:18 -0500 Subject: [PATCH 13/40] Fix data_type ctor call in JSON_TEST (#17273) Fixes call to `data_type{}` ctor in `json_test.cpp`. The 2-parameter ctor is for fixed-point-types only and will assert in a debug build if used incorrectly: https://github.com/rapidsai/cudf/blob/2db58d58b4a986c2c6fad457f291afb1609fd458/cpp/include/cudf/types.hpp#L277-L280 Partial stack trace from a gdb run ``` #5 0x000077b1530bc71b in __assert_fail_base (fmt=0x77b153271130 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x58c3e4baaa98 "id == type_id::DECIMAL32 || id == type_id::DECIMAL64 || id == type_id::DECIMAL128", file=0x58c3e4baaa70 "/cudf/cpp/include/cudf/types.hpp", line=279, function=) at ./assert/assert.c:92 #6 0x000077b1530cde96 in __GI___assert_fail ( assertion=0x58c3e4baaa98 "id == type_id::DECIMAL32 || id == type_id::DECIMAL64 || id == type_id::DECIMAL128", file=0x58c3e4baaa70 "/cudf/cpp/include/cudf/types.hpp", line=279, function=0x58c3e4baaa38 "cudf::data_type::data_type(cudf::type_id, int32_t)") at ./assert/assert.c:101 #7 0x000058c3e48ba594 in cudf::data_type::data_type (this=0x7fffdd3f7530, id=cudf::type_id::STRING, scale=0) at /cudf/cpp/include/cudf/types.hpp:279 #8 0x000058c3e49215d9 in JsonReaderTest_MixedTypesWithSchema_Test::TestBody (this=0x58c3e5ea13a0) at /cudf/cpp/tests/io/json/json_test.cpp:2887 ``` Authors: - David Wendt 
(https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17273 --- cpp/tests/io/json/json_test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 199b0092473..26937c9298a 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2885,9 +2885,9 @@ TEST_F(JsonReaderTest, MixedTypesWithSchema) std::map data_types; std::map child_types; child_types.insert( - std::pair{"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING, 0}, {}}}); - data_types.insert(std::pair{ - "data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST, 0}, child_types}}); + std::pair{"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING}, {}}}); + data_types.insert( + std::pair{"data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST}, child_types}}); cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) From 18041b5b91234c4fd878497739498f926838bb39 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 7 Nov 2024 22:16:01 -0500 Subject: [PATCH 14/40] Plumb pylibcudf datetime APIs through cudf python (#17275) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17275 --- python/cudf/cudf/_lib/datetime.pyx | 180 +++++++++++------------------ 1 file changed, 70 insertions(+), 110 deletions(-) diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index d844466120f..2c7a585f4b1 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -9,41 +9,29 @@ from libcpp.utility cimport move 
cimport pylibcudf.libcudf.datetime as libcudf_datetime from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.filling cimport calendrical_month_sequence from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from pylibcudf.datetime import DatetimeComponent +from pylibcudf.datetime import DatetimeComponent, RoundingFrequency from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc @acquire_spill_lock() def add_months(Column col, Column months): # months must be int16 dtype - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef column_view months_view = months.view() - - with nogil: - c_result = move( - libcudf_datetime.add_calendrical_months( - col_view, - months_view - ) + return Column.from_pylibcudf( + plc.datetime.add_calendrical_months( + col.to_pylibcudf(mode="read"), + months.to_pylibcudf(mode="read") ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def extract_datetime_component(Column col, object field): - - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef libcudf_datetime.datetime_component component - component_names = { "year": DatetimeComponent.YEAR, "month": DatetimeComponent.MONTH, @@ -57,33 +45,29 @@ def extract_datetime_component(Column col, object field): "nanosecond": DatetimeComponent.NANOSECOND, } if field == "day_of_year": - with nogil: - c_result = move(libcudf_datetime.day_of_year(col_view)) + result = Column.from_pylibcudf( + plc.datetime.day_of_year( + col.to_pylibcudf(mode="read") + ) + ) elif field in component_names: - component = component_names[field] - with nogil: - c_result = move( - libcudf_datetime.extract_datetime_component( - col_view, - component - ) + result = Column.from_pylibcudf( + plc.datetime.extract_datetime_component( + col.to_pylibcudf(mode="read"), + 
component_names[field], ) + ) + if field == "weekday": + # Pandas counts Monday-Sunday as 0-6 + # while libcudf counts Monday-Sunday as 1-7 + result = result - result.dtype.type(1) else: raise ValueError(f"Invalid field: '{field}'") - result = Column.from_unique_ptr(move(c_result)) - - if field == "weekday": - # Pandas counts Monday-Sunday as 0-6 - # while libcudf counts Monday-Sunday as 1-7 - result = result - result.dtype.type(1) - return result cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq): - cdef libcudf_datetime.rounding_frequency freq_val - # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timedelta.resolution_string.html old_to_new_freq_map = { "H": "h", @@ -101,78 +85,60 @@ cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq): FutureWarning ) freq = old_to_new_freq_map.get(freq) - if freq == "D": - freq_val = libcudf_datetime.rounding_frequency.DAY - elif freq == "h": - freq_val = libcudf_datetime.rounding_frequency.HOUR - elif freq == "min": - freq_val = libcudf_datetime.rounding_frequency.MINUTE - elif freq == "s": - freq_val = libcudf_datetime.rounding_frequency.SECOND - elif freq == "ms": - freq_val = libcudf_datetime.rounding_frequency.MILLISECOND - elif freq == "us": - freq_val = libcudf_datetime.rounding_frequency.MICROSECOND - elif freq == "ns": - freq_val = libcudf_datetime.rounding_frequency.NANOSECOND + rounding_fequency_map = { + "D": RoundingFrequency.DAY, + "h": RoundingFrequency.HOUR, + "min": RoundingFrequency.MINUTE, + "s": RoundingFrequency.SECOND, + "ms": RoundingFrequency.MILLISECOND, + "us": RoundingFrequency.MICROSECOND, + "ns": RoundingFrequency.NANOSECOND, + } + if freq in rounding_fequency_map: + return rounding_fequency_map[freq] else: raise ValueError(f"Invalid resolution: '{freq}'") - return freq_val @acquire_spill_lock() def ceil_datetime(Column col, object freq): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef 
libcudf_datetime.rounding_frequency freq_val = \ - _get_rounding_frequency(freq) - - with nogil: - c_result = move(libcudf_datetime.ceil_datetimes(col_view, freq_val)) - - result = Column.from_unique_ptr(move(c_result)) - return result + return Column.from_pylibcudf( + plc.datetime.ceil_datetimes( + col.to_pylibcudf(mode="read"), + _get_rounding_frequency(freq), + ) + ) @acquire_spill_lock() def floor_datetime(Column col, object freq): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef libcudf_datetime.rounding_frequency freq_val = \ - _get_rounding_frequency(freq) - - with nogil: - c_result = move(libcudf_datetime.floor_datetimes(col_view, freq_val)) - - result = Column.from_unique_ptr(move(c_result)) - return result + return Column.from_pylibcudf( + plc.datetime.floor_datetimes( + col.to_pylibcudf(mode="read"), + _get_rounding_frequency(freq), + ) + ) @acquire_spill_lock() def round_datetime(Column col, object freq): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef libcudf_datetime.rounding_frequency freq_val = \ - _get_rounding_frequency(freq) - - with nogil: - c_result = move(libcudf_datetime.round_datetimes(col_view, freq_val)) - - result = Column.from_unique_ptr(move(c_result)) - return result + return Column.from_pylibcudf( + plc.datetime.round_datetimes( + col.to_pylibcudf(mode="read"), + _get_rounding_frequency(freq), + ) + ) @acquire_spill_lock() def is_leap_year(Column col): """Returns a boolean indicator whether the year of the date is a leap year """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.is_leap_year(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.is_leap_year( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() @@ -199,34 +165,28 @@ def extract_quarter(Column col): Returns a column which contains the corresponding quarter of the 
year for every timestamp inside the input column. """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.extract_quarter(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.extract_quarter( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def days_in_month(Column col): """Extracts the number of days in the month of the date """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.days_in_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.days_in_month( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def last_day_of_month(Column col): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.last_day_of_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.last_day_of_month( + col.to_pylibcudf(mode="read") + ) + ) From 7b80a449514fcd04ceffc5da64522e45512f7324 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 7 Nov 2024 19:43:34 -0800 Subject: [PATCH 15/40] Add IWYU to CI (#17078) This PR adds [`include-what-you-use`](https://github.com/include-what-you-use/include-what-you-use/) to the CI job running clang-tidy. Like clang-tidy, IWYU runs via CMake integration and only runs on cpp files, not cu files. This should help us shrink binaries and reduce compilation times in cases where headers are being included unnecessarily, and it helps keep our include lists clean. The IWYU suggestions for additions are quite noisy and the team determined this to be unnecessary, so this PR instead post-filters the outputs to only show the removals. 
The final suggestions are uploaded to a file that is uploaded to the GHA page so that it can be downloaded, inspected, and easily applied locally. Resolves #581. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) - James Lamb (https://github.com/jameslamb) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17078 --- .github/workflows/test.yaml | 3 +- ci/clang_tidy.sh | 29 ------ ci/cpp_linters.sh | 47 +++++++++ cpp/.clang-tidy | 4 +- cpp/CMakeLists.txt | 58 ++++++++--- cpp/scripts/parse_iwyu_output.py | 170 +++++++++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 - dependencies.yaml | 6 ++ 8 files changed, 271 insertions(+), 47 deletions(-) delete mode 100755 ci/clang_tidy.sh create mode 100755 ci/cpp_linters.sh create mode 100644 cpp/scripts/parse_iwyu_output.py diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1275aad757c..ad3f5940b94 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,8 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - run_script: "ci/clang_tidy.sh" + run_script: "ci/cpp_linters.sh" + file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 diff --git a/ci/clang_tidy.sh b/ci/clang_tidy.sh deleted file mode 100755 index 4d5d3fc3136..00000000000 --- a/ci/clang_tidy.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. - -set -euo pipefail - -rapids-logger "Create clang-tidy conda environment" -. 
/opt/conda/etc/profile.d/conda.sh - -ENV_YAML_DIR="$(mktemp -d)" - -rapids-dependency-file-generator \ - --output conda \ - --file-key clang_tidy \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" - -rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy - -# Temporarily allow unbound variables for conda activation. -set +u -conda activate clang_tidy -set -u - -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" - -source rapids-configure-sccache - -# Run the build via CMake, which will run clang-tidy when CUDF_CLANG_TIDY is enabled. -cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON -GNinja -cmake --build cpp/build diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh new file mode 100755 index 00000000000..a7c7255456f --- /dev/null +++ b/ci/cpp_linters.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-logger "Create checks conda environment" +. /opt/conda/etc/profile.d/conda.sh + +ENV_YAML_DIR="$(mktemp -d)" + +rapids-dependency-file-generator \ + --output conda \ + --file-key clang_tidy \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" + +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate clang_tidy +set -u + +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + +source rapids-configure-sccache + +# TODO: For testing purposes, clone and build IWYU. We can switch to a release +# once a clang 19-compatible version is available, which should be soon +# (https://github.com/include-what-you-use/include-what-you-use/issues/1641). 
+git clone --depth 1 https://github.com/include-what-you-use/include-what-you-use.git +pushd include-what-you-use +# IWYU's CMake build uses some Python scripts that assume that the cwd is +# importable, so support that legacy behavior. +export PYTHONPATH=${PWD}:${PYTHONPATH:-} +cmake -S . -B build -GNinja --install-prefix=${CONDA_PREFIX} +cmake --build build +cmake --install build +popd + +# Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled. +cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_STATIC_LINTERS=ON -GNinja +cmake --build cpp/build 2>&1 | python cpp/scripts/parse_iwyu_output.py + +# Remove invalid components of the path for local usage. The path below is +# valid in the CI due to where the project is cloned, but presumably the fixes +# will be applied locally from inside a clone of cudf. +sed -i 's/\/__w\/cudf\/cudf\///' iwyu_results.txt diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index 12120a5c6d1..0e5699876fc 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -39,8 +39,8 @@ Checks: -clang-analyzer-optin.core.EnumCastOutOfRange, -clang-analyzer-optin.cplusplus.UninitializedObject' -WarningsAsErrors: '*' -HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' +WarningsAsErrors: '' +HeaderFilterRegex: '.*cudf/cpp/(src|include).*' ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none CheckOptions: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bfa4bf80724..d3bf7019e35 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -88,7 +88,7 @@ option( ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL} ) mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) -option(CUDF_CLANG_TIDY "Enable clang-tidy checking" OFF) +option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -146,8 +146,10 @@ 
if(NOT CUDF_GENERATED_INCLUDE_DIR) endif() # ################################################################################################## -# * clang-tidy configuration ---------------------------------------------------------------------- -if(CUDF_CLANG_TIDY) +# * linter configuration --------------------------------------------------------------------------- +if(CUDF_STATIC_LINTERS) + # For simplicity, for now we assume that all linters can be installed into an environment where + # any linter is being run. We could relax this requirement if desired. find_program( CLANG_TIDY_EXE NAMES "clang-tidy" @@ -174,24 +176,48 @@ if(CUDF_CLANG_TIDY) "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}" ) endif() + + find_program(IWYU_EXE NAMES include-what-you-use iwyu REQUIRED) endif() # Turn on the clang-tidy property for a target excluding the files specified in SKIPPED_FILES. -function(enable_clang_tidy target) - set(_tidy_options) +function(enable_static_checkers target) + set(_tidy_options IWYU CLANG_TIDY) set(_tidy_one_value) set(_tidy_multi_value SKIPPED_FILES) cmake_parse_arguments( - _TIDY "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} + _LINT "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} ) - if(CUDF_CLANG_TIDY) - # clang will complain about unused link libraries on the compile line unless we specify - # -Qunused-arguments. - set_target_properties( - ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" - ) - foreach(file IN LISTS _TIDY_SKIPPED_FILES) + if(CUDF_STATIC_LINTERS) + if(_LINT_CLANG_TIDY) + # clang will complain about unused link libraries on the compile line unless we specify + # -Qunused-arguments. + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + endif() + if(_LINT_IWYU) + # A few extra warnings pop up when building with IWYU. 
I'm not sure why, but they are not + # relevant since they don't show up in any other build so it's better to suppress them until + # we can figure out the cause. Setting this as part of CXX_INCLUDE_WHAT_YOU_USE does not + # appear to be sufficient, we must also ensure that it is set to the underlying target's CXX + # compile flags. To do this completely cleanly we should modify the flags on the target rather + # than the global CUDF_CXX_FLAGS, but this solution is good enough for now since we never run + # the linters on real builds. + foreach(_flag -Wno-missing-braces -Wno-absolute-value -Wunneeded-internal-declaration) + list(FIND CUDF_CXX_FLAGS "${flag}" _flag_index) + if(_flag_index EQUAL -1) + list(APPEND CUDF_CXX_FLAGS ${flag}) + endif() + endforeach() + set(CUDF_CXX_FLAGS + "${CUDF_CXX_FLAGS}" + PARENT_SCOPE + ) + set_target_properties(${target} PROPERTIES CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}") + endif() + foreach(file IN LISTS _LINT_SKIPPED_FILES) set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) endforeach() endif() @@ -771,11 +797,15 @@ set_target_properties( INTERFACE_POSITION_INDEPENDENT_CODE ON ) +# Note: This must come before the target_compile_options below so that the function can modify the +# flags if necessary. +enable_static_checkers( + cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp CLANG_TIDY IWYU +) target_compile_options( cudf PRIVATE "$<$:${CUDF_CXX_FLAGS}>" "$<$:${CUDF_CUDA_FLAGS}>" ) -enable_clang_tidy(cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp) if(CUDF_BUILD_STACKTRACE_DEBUG) # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option diff --git a/cpp/scripts/parse_iwyu_output.py b/cpp/scripts/parse_iwyu_output.py new file mode 100644 index 00000000000..822a980a1a8 --- /dev/null +++ b/cpp/scripts/parse_iwyu_output.py @@ -0,0 +1,170 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Helper script to modify IWYU output to only include removals. + +Lines that are not from include-what-you-use are removed from the output. +""" + +import argparse +import re +from enum import Enum + + +class Mode(Enum): + NORMAL = 0 + ADD = 1 + REMOVE = 2 + FULL_INCLUDE_LIST = 3 + + +def extract_include_file(include_line): + """Extract the core file path from an #include directive.""" + match = re.search(r'#include\s+[<"]([^">]+)[">]', include_line) + if match: + return match.group(1) + return None + + +def parse_output(input_stream): + include_modifications = {} + current_file = None + mode = Mode.NORMAL + + for line in input_stream: + if match := re.match(r"(\/\S+) should add these lines:", line): + current_file = match.group(1) + include_modifications.setdefault( + current_file, + { + "add_includes": [], + "remove_includes": [], + "full_include_list": [], + }, + ) + mode = Mode.ADD + elif match := re.match(r"(\/\S+) should remove these lines:", line): + mode = Mode.REMOVE + elif match := re.match(r"The full include-list for (\/\S+):", line): + mode = Mode.FULL_INCLUDE_LIST + elif line.strip() == "---": + current_file = None + mode = Mode.NORMAL + else: + if current_file: + if mode == Mode.ADD: + include_modifications[current_file]["add_includes"].append( + line.strip() + ) + elif mode == Mode.REMOVE: + include_modifications[current_file][ + "remove_includes" + ].append(line.strip()) + elif mode == 
Mode.FULL_INCLUDE_LIST: + include_modifications[current_file][ + "full_include_list" + ].append(line.strip()) + else: + if ( + line.strip() + and "include-what-you-use reported diagnostics" not in line + and "In file included from" not in line + and "has correct #includes/fwd-decls" not in line + ): + print(line, end="") + + return include_modifications + + +def post_process_includes(include_modifications): + """Deduplicate and remove redundant entries from add and remove includes.""" + for mods in include_modifications.values(): + # Deduplicate add_includes and remove_includes + mods["add_includes"] = list(set(mods["add_includes"])) + mods["remove_includes"] = list(set(mods["remove_includes"])) + + # Extract file paths from add_includes and remove_includes + add_files = { + extract_include_file(line) for line in mods["add_includes"] + } + remove_files = { + extract_include_file(line) for line in mods["remove_includes"] + } + + # Remove entries that exist in both add_includes and remove_includes + common_files = add_files & remove_files + mods["add_includes"] = [ + line + for line in mods["add_includes"] + if extract_include_file(line) not in common_files + ] + mods["remove_includes"] = [ + line + for line in mods["remove_includes"] + if extract_include_file(line) not in common_files + ] + + # Remove entries that exist in add_includes from full_include_list + mods["full_include_list"] = [ + include + for include in mods["full_include_list"] + if extract_include_file(include) not in add_files + ] + + +def write_output(include_modifications, output_stream): + for filename, mods in include_modifications.items(): + if mods["remove_includes"]: + # IWYU requires all sections to exist, so we write out this header even + # though we never write out any actual additions. 
+ output_stream.write(f"{filename} should add these lines:\n\n") + + output_stream.write(f"{filename} should remove these lines:\n") + for line in mods["remove_includes"]: + output_stream.write(line + "\n") + output_stream.write("\n") + + output_stream.write(f"The full include-list for {filename}:\n") + for line in mods["full_include_list"]: + output_stream.write(line + "\n") + output_stream.write("---\n") + + +def main(): + parser = argparse.ArgumentParser( + description="Process include modifications from a build output log." + ) + parser.add_argument( + "input", + nargs="?", + type=argparse.FileType("r"), + default="-", + help="Input file to read (default: stdin)", + ) + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default="iwyu_results.txt", + help="Output file to write (default: iwyu_results.txt)", + ) + args = parser.parse_args() + + include_modifications = parse_output(args.input) + post_process_includes(include_modifications) + write_output(include_modifications, args.output) + + +if __name__ == "__main__": + main() diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e9ba58ba224..f502195aea4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -83,7 +83,6 @@ function(ConfigureTest CMAKE_TEST_NAME) "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" ) endif() - enable_clang_tidy(${CMAKE_TEST_NAME}) endfunction() # ################################################################################################## diff --git a/dependencies.yaml b/dependencies.yaml index 93213172445..59f8f2fda49 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -587,6 +587,12 @@ dependencies: packages: - clang==19.1.0 - clang-tools==19.1.0 + # TODO: These are build requirements for IWYU and can be replaced + # with IWYU itself once a conda package of IWYU supporting clang 19 + # is available.
+ - clangdev==19.1.0 + - llvm==19.1.0 + - llvmdev==19.1.0 docs: common: - output_types: [conda] From e8935b9959dc4e62a1e486fb0359374df0e6e2ea Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Thu, 7 Nov 2024 20:07:12 -0800 Subject: [PATCH 16/40] Rewrite Java API `Table.readJSON` to return the output from libcudf `read_json` directly (#17180) With this PR, `Table.readJSON` will return the output from libcudf `read_json` directly without the need of reordering the columns to match with the input schema, as well as generating all-nulls columns for the ones in the input schema that do not exist in the JSON data. This is because libcudf `read_json` already does these thus we no longer have to do it. Depends on: * https://github.com/rapidsai/cudf/pull/17029 Partially contributes to https://github.com/NVIDIA/spark-rapids/issues/11560. Closes #17002 Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/17180 --- java/src/main/java/ai/rapids/cudf/Table.java | 262 ++----------------- java/src/main/native/src/TableJni.cpp | 32 ++- 2 files changed, 43 insertions(+), 251 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index dbee53640aa..b01ce31b1f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -259,7 +259,6 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte lineDelimiter) throws CudfException; @@ -275,7 +274,6 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte 
lineDelimiter, long dsHandle) throws CudfException; @@ -1092,224 +1090,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { return readJSON(schema, opts, buffer, 0, buffer.length); } - private static class DidViewChange { - ColumnVector changeWasNeeded = null; - boolean noChangeNeeded = false; - - public static DidViewChange yes(ColumnVector cv) { - DidViewChange ret = new DidViewChange(); - ret.changeWasNeeded = cv; - return ret; - } - - public static DidViewChange no() { - DidViewChange ret = new DidViewChange(); - ret.noChangeNeeded = true; - return ret; - } - } - - private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.NestedChildren children, - ColumnView cv) { - // We need to do this recursively to be sure it all matches as expected. - // If we run into problems where the data types don't match, we are not - // going to fix up the data types. We are only going to reorder the columns. - if (schema.getType() == DType.STRUCT) { - if (cv.getType() != DType.STRUCT) { - // The types don't match so just return the input unchanged... - return DidViewChange.no(); - } else { - String[] foundNames; - if (children == null) { - foundNames = new String[0]; - } else { - foundNames = children.getNames(); - } - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. 
- DType[] types = schema.getChildTypes(); - String[] neededNames = schema.getColumnNames(); - ColumnView[] columns = new ColumnView[neededNames.length]; - try { - boolean somethingChanged = false; - if (columns.length != foundNames.length) { - somethingChanged = true; - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededNames[i]; - Integer index = indices.get(neededColumnName); - Schema childSchema = schema.getChild(i); - if (index != null) { - if (childSchema.isStructOrHasStructDescendant()) { - ColumnView child = cv.getChildColumnView(index); - boolean shouldCloseChild = true; - try { - if (index != i) { - somethingChanged = true; - } - DidViewChange childResult = gatherJSONColumns(schema.getChild(i), - children.getChild(index), child); - if (childResult.noChangeNeeded) { - shouldCloseChild = false; - columns[i] = child; - } else { - somethingChanged = true; - columns[i] = childResult.changeWasNeeded; - } - } finally { - if (shouldCloseChild) { - child.close(); - } - } - } else { - if (index != i) { - somethingChanged = true; - } - columns[i] = cv.getChildColumnView(index); - } - } else { - somethingChanged = true; - if (types[i] == DType.LIST) { - try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else if (types[i] == DType.STRUCT) { - int numStructChildren = childSchema.getNumChildren(); - HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; - for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { - structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildren)) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } 
- } - } - if (somethingChanged) { - try (ColumnView ret = new ColumnView(cv.type, cv.rows, Optional.of(cv.nullCount), - cv.getValid(), null, columns)) { - return DidViewChange.yes(ret.copyToColumnVector()); - } - } else { - return DidViewChange.no(); - } - } finally { - for (ColumnView c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } else if (schema.getType() == DType.LIST && cv.getType() == DType.LIST) { - if (schema.isStructOrHasStructDescendant()) { - String [] childNames = children.getNames(); - if (childNames.length == 2 && - "offsets".equals(childNames[0]) && - "element".equals(childNames[1])) { - try (ColumnView child = cv.getChildColumnView(0)){ - DidViewChange listResult = gatherJSONColumns(schema.getChild(0), - children.getChild(1), child); - if (listResult.noChangeNeeded) { - return DidViewChange.no(); - } else { - try (ColumnView listView = new ColumnView(cv.type, cv.rows, - Optional.of(cv.nullCount), cv.getValid(), cv.getOffsets(), - new ColumnView[]{listResult.changeWasNeeded})) { - return DidViewChange.yes(listView.copyToColumnVector()); - } finally { - listResult.changeWasNeeded.close(); - } - } - } - } - } - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } else { - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. 
- return DidViewChange.no(); - } - } - - private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) { - String[] neededColumns = schema.getColumnNames(); - if (neededColumns == null || neededColumns.length == 0) { - return twm.releaseTable(); - } else { - String[] foundNames = twm.getColumnNames(); - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - ColumnVector[] columns = new ColumnVector[neededColumns.length]; - try (Table tbl = twm.releaseTable()) { - int rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount(); - if (rowCount < 0) { - throw new IllegalStateException( - "No empty row count provided and the table read has no row count or columns"); - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededColumns[i]; - Integer index = indices.get(neededColumnName); - if (index != null) { - if (schema.getChild(i).isStructOrHasStructDescendant()) { - DidViewChange gathered = gatherJSONColumns(schema.getChild(i), twm.getChild(index), - tbl.getColumn(index)); - if (gathered.noChangeNeeded) { - columns[i] = tbl.getColumn(index).incRefCount(); - } else { - columns[i] = gathered.changeWasNeeded; - } - } else { - columns[i] = tbl.getColumn(index).incRefCount(); - } - } else { - if (types[i] == DType.LIST) { - Schema listSchema = schema.getChild(i); - Schema elementSchema = listSchema.getChild(0); - try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else if (types[i] == DType.STRUCT) { - Schema structSchema = schema.getChild(i); - int numStructChildren = structSchema.getNumChildren(); - DataType[] structChildrenTypes = new DataType[numStructChildren]; - for (int j = 0; j < numStructChildren; j++) { - structChildrenTypes[j] = 
structSchema.getChild(j).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildrenTypes)) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } - } - } - return new Table(columns); - } finally { - for (ColumnVector c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } - /** * Read a JSON file. * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. @@ -1318,10 +1098,6 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp * @return the file parsed as a table on the GPU. */ public static Table readJSON(Schema schema, JSONOptions opts, File path) { - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1336,11 +1112,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, -1); + return twm.releaseTable(); } } @@ -1361,6 +1136,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. 
@@ -1370,6 +1149,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, HostMemoryAllocator hostMemoryAllocator, int emptyRowCount) { @@ -1381,14 +1161,14 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon assert offset >= 0 && offset < buffer.length; try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); - return readJSON(schema, opts, newBuf, 0, len, emptyRowCount); + return readJSON(schema, opts, newBuf, 0, len); } } + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, int emptyRowCount) { - return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get(), - emptyRowCount); + return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, @@ -1470,6 +1250,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. @@ -1478,6 +1262,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b * @param emptyRowCount the number of rows to use if no columns were found. * @return the data parsed as a table on the GPU. 
*/ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, long offset, long len, int emptyRowCount) { if (len <= 0) { @@ -1486,10 +1271,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1505,10 +1286,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } } @@ -1525,18 +1305,19 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param ds the DataSource to read from. * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. 
*/ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1550,11 +1331,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter(), dsHandle))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0a667978ca3..1f8b1ea207d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1037,21 +1037,23 @@ cudf::io::schema_element read_schema_element(int& index, if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { std::map child_elems; int num_children = children[index]; + std::vector child_names(num_children); // go to the next entry, so recursion can parse it. 
index++; for (int i = 0; i < num_children; i++) { - auto const name = std::string{names.get(index).get()}; + auto name = std::string{names.get(index).get()}; child_elems.insert( std::pair{name, cudf::jni::read_schema_element(index, children, names, types, scales)}); + child_names[i] = std::move(name); } - return cudf::io::schema_element{d_type, std::move(child_elems)}; + return cudf::io::schema_element{d_type, std::move(child_elems), {std::move(child_names)}}; } else { if (children[index] != 0) { throw std::invalid_argument("found children for a type that should have none"); } // go to the next entry before returning... index++; - return cudf::io::schema_element{d_type, {}}; + return cudf::io::schema_element{d_type, {}, std::nullopt}; } } @@ -1824,7 +1826,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter, jlong ds_handle) @@ -1853,6 +1854,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1864,7 +1866,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -1886,13 +1887,19 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, } std::map data_types; + std::vector name_order; int at = 0; while (at < n_types.size()) { auto const name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.push_back(name); } - opts.dtypes(data_types); + auto const prune_columns = data_types.size() != 0; + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.prune_columns(prune_columns).dtypes(structs); + } else { // should infer the types } @@ -1925,7 +1932,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter) { @@ -1968,6 +1974,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1979,7 +1986,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -2001,13 +2007,19 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, } std::map data_types; + std::vector name_order; + name_order.reserve(n_types.size()); int at = 0; while (at < n_types.size()) { - auto const name = std::string{n_col_names.get(at).get()}; + auto name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.emplace_back(std::move(name)); } - opts.dtypes(data_types); + auto const prune_columns = data_types.size() != 0; + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.prune_columns(prune_columns).dtypes(structs); } else { // should infer the types } From 150d8d8c3a3a4aec87004fd0d130b56e388fa43d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 8 Nov 2024 08:59:11 +0000 Subject: [PATCH 17/40] Implement inequality joins by translation to conditional joins (#17000) Implement inequality joins by using the newly-exposed conditional join from pylibcudf. 
- Closes #16926 Authors: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17000 --- python/cudf_polars/cudf_polars/dsl/expr.py | 2 + .../cudf_polars/dsl/expressions/base.py | 35 +++++++- python/cudf_polars/cudf_polars/dsl/ir.py | 70 +++++++++++++++- python/cudf_polars/cudf_polars/dsl/to_ast.py | 79 ++++++++++++++++--- .../cudf_polars/cudf_polars/dsl/translate.py | 68 ++++++---------- .../cudf_polars/cudf_polars/utils/versions.py | 2 + python/cudf_polars/tests/dsl/test_to_ast.py | 38 ++++++++- python/cudf_polars/tests/test_join.py | 16 +++- 8 files changed, 248 insertions(+), 62 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index e748ec16f14..1881286ccbb 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -19,6 +19,7 @@ from cudf_polars.dsl.expressions.base import ( AggInfo, Col, + ColRef, Expr, NamedExpr, ) @@ -40,6 +41,7 @@ "LiteralColumn", "Len", "Col", + "ColRef", "BooleanFunction", "StringFunction", "TemporalFunction", diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index effe8cb2378..21ba7aea707 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -20,7 +20,7 @@ from cudf_polars.containers import Column, DataFrame -__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext"] +__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext", "ColRef"] class AggInfo(NamedTuple): @@ -249,3 +249,36 @@ def do_evaluate( def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" return AggInfo([(self, plc.aggregation.collect_list(), self)]) + + +class ColRef(Expr): + __slots__ = 
("index", "table_ref") + _non_child = ("dtype", "index", "table_ref") + index: int + table_ref: plc.expressions.TableReference + + def __init__( + self, + dtype: plc.DataType, + index: int, + table_ref: plc.expressions.TableReference, + column: Expr, + ) -> None: + if not isinstance(column, Col): + raise TypeError("Column reference should only apply to columns") + self.dtype = dtype + self.index = index + self.table_ref = table_ref + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + raise NotImplementedError( + "Only expect this node as part of an expression translated to libcudf AST." + ) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index a242ff9300f..bc42b4a254f 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -29,8 +29,9 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame from cudf_polars.dsl.nodebase import Node -from cudf_polars.dsl.to_ast import to_parquet_filter +from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter from cudf_polars.utils import dtypes +from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: from collections.abc import Callable, Hashable, MutableMapping, Sequence @@ -48,6 +49,7 @@ "Select", "GroupBy", "Join", + "ConditionalJoin", "HStack", "Distinct", "Sort", @@ -522,6 +524,12 @@ def do_evaluate( ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index + if POLARS_VERSION_GT_112: + # If we sliced away some data from the start, that + # shifts the row index. 
+ # But prior to 1.13, polars had this wrong, so we match behaviour + # https://github.com/pola-rs/polars/issues/19607 + offset += skip_rows # pragma: no cover; polars 1.13 not yet released dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) @@ -890,6 +898,66 @@ def do_evaluate( return DataFrame(broadcasted).slice(options.slice) +class ConditionalJoin(IR): + """A conditional inner join of two dataframes on a predicate.""" + + __slots__ = ("predicate", "options", "ast_predicate") + _non_child = ("schema", "predicate", "options") + predicate: expr.Expr + options: tuple + + def __init__( + self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR + ) -> None: + self.schema = schema + self.predicate = predicate + self.options = options + self.children = (left, right) + self.ast_predicate = to_ast(predicate) + _, join_nulls, zlice, suffix, coalesce = self.options + # Preconditions from polars + assert not join_nulls + assert not coalesce + if self.ast_predicate is None: + raise NotImplementedError( + f"Conditional join with predicate {predicate}" + ) # pragma: no cover; polars never delivers expressions we can't handle + self._non_child_args = (self.ast_predicate, zlice, suffix) + + @classmethod + def do_evaluate( + cls, + predicate: plc.expressions.Expression, + zlice: tuple[int, int] | None, + suffix: str, + left: DataFrame, + right: DataFrame, + ) -> DataFrame: + """Evaluate and return a dataframe.""" + lg, rg = plc.join.conditional_inner_join(left.table, right.table, predicate) + left = DataFrame.from_table( + plc.copying.gather( + left.table, lg, plc.copying.OutOfBoundsPolicy.DONT_CHECK + ), + left.column_names, + ) + right = DataFrame.from_table( + plc.copying.gather( + right.table, rg, plc.copying.OutOfBoundsPolicy.DONT_CHECK + ), + right.column_names, + ) + right = right.rename_columns( + { + name: f"{name}{suffix}" + for name in right.column_names + if name in left.column_names_set + } + ) + 
result = left.with_columns(right.columns) + return result.slice(zlice) + + class Join(IR): """A join of two dataframes.""" diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py index 9a0838631cc..acc4b3669af 100644 --- a/python/cudf_polars/cudf_polars/dsl/to_ast.py +++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py @@ -14,12 +14,14 @@ from pylibcudf import expressions as plc_expr from cudf_polars.dsl import expr -from cudf_polars.dsl.traversal import CachingVisitor +from cudf_polars.dsl.traversal import CachingVisitor, reuse_if_unchanged from cudf_polars.typing import GenericTransformer if TYPE_CHECKING: from collections.abc import Mapping + from cudf_polars.typing import ExprTransformer + # Can't merge these op-mapping dictionaries because scoped enum values # are exposed by cython with equality/hash based one their underlying # representation type. So in a dict they are just treated as integers. @@ -128,7 +130,14 @@ def _to_ast(node: expr.Expr, self: Transformer) -> plc_expr.Expression: def _(node: expr.Col, self: Transformer) -> plc_expr.Expression: if self.state["for_parquet"]: return plc_expr.ColumnNameReference(node.name) - return plc_expr.ColumnReference(self.state["name_to_index"][node.name]) + raise TypeError("Should always be wrapped in a ColRef node before translation") + + +@_to_ast.register +def _(node: expr.ColRef, self: Transformer) -> plc_expr.Expression: + if self.state["for_parquet"]: + raise TypeError("Not expecting ColRef node in parquet filter") + return plc_expr.ColumnReference(node.index, node.table_ref) @_to_ast.register @@ -238,9 +247,7 @@ def to_parquet_filter(node: expr.Expr) -> plc_expr.Expression | None: return None -def to_ast( - node: expr.Expr, *, name_to_index: Mapping[str, int] -) -> plc_expr.Expression | None: +def to_ast(node: expr.Expr) -> plc_expr.Expression | None: """ Convert an expression to libcudf AST nodes suitable for compute_column. 
@@ -248,18 +255,66 @@ def to_ast( ---------- node Expression to convert. - name_to_index - Mapping from column names to their index in the table that - will be used for expression evaluation. + + Notes + ----- + `Col` nodes must always be wrapped in `TableRef` nodes when + converting to an ast expression so that their table reference and + index are provided. Returns ------- - pylibcudf Expressoin if conversion is possible, otherwise None. + pylibcudf Expression if conversion is possible, otherwise None. """ - mapper = CachingVisitor( - _to_ast, state={"for_parquet": False, "name_to_index": name_to_index} - ) + mapper = CachingVisitor(_to_ast, state={"for_parquet": False}) try: return mapper(node) except (KeyError, NotImplementedError): return None + + +def _insert_colrefs(node: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(node, expr.Col): + return expr.ColRef( + node.dtype, + rec.state["name_to_index"][node.name], + rec.state["table_ref"], + node, + ) + return reuse_if_unchanged(node, rec) + + +def insert_colrefs( + node: expr.Expr, + *, + table_ref: plc.expressions.TableReference, + name_to_index: Mapping[str, int], +) -> expr.Expr: + """ + Insert column references into an expression before conversion to libcudf AST. + + Parameters + ---------- + node + Expression to insert references into. + table_ref + pylibcudf `TableReference` indicating whether column + references are coming from the left or right table. + name_to_index: + Mapping from column names to column indices in the table + eventually used for evaluation. + + Notes + ----- + All column references are wrapped in the same, singular, table + reference, so this function relies on the expression only + containing column references from a single table. + + Returns + ------- + New expression with column references inserted. 
+ """ + mapper = CachingVisitor( + _insert_colrefs, state={"table_ref": table_ref, "name_to_index": name_to_index} + ) + return mapper(node) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 5181214819e..2711676d31e 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -9,7 +9,7 @@ import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch -from typing import TYPE_CHECKING, Any +from typing import Any import pyarrow as pa from typing_extensions import assert_never @@ -21,13 +21,10 @@ import pylibcudf as plc from cudf_polars.dsl import expr, ir -from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged +from cudf_polars.dsl.to_ast import insert_colrefs from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes, sorting -if TYPE_CHECKING: - from cudf_polars.typing import ExprTransformer - __all__ = ["translate_ir", "translate_named_expr"] @@ -204,55 +201,40 @@ def _( raise NotImplementedError( f"Unsupported join type {how}" ) # pragma: no cover; asof joins not yet exposed - # No exposure of mixed/conditional joins in pylibcudf yet, so in - # the first instance, implement by doing a cross join followed by - # a filter. - _, join_nulls, zlice, suffix, coalesce = node.options - cross = ir.Join( - schema, - [], - [], - ("cross", join_nulls, None, suffix, coalesce), - inp_left, - inp_right, - ) - dtype = plc.DataType(plc.TypeId.BOOL8) if op2 is None: ops = [op1] else: ops = [op1, op2] - suffix = cross.options[3] - - # Column references in the right table refer to the post-join - # names, so with suffixes. 
- def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr: - if isinstance(e, expr.Col) and e.name in inp_left.schema: - return type(e)(e.dtype, f"{e.name}{suffix}") - return reuse_if_unchanged(e, rec) - - mapper = make_recursive(_rename) - right_on = [ - expr.NamedExpr( - f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new - ) - for new, old in zip( - (mapper(e.value) for e in right_on), right_on, strict=True - ) - ] - mask = functools.reduce( + + dtype = plc.DataType(plc.TypeId.BOOL8) + predicate = functools.reduce( functools.partial( expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND ), ( - expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value) + expr.BinOp( + dtype, + expr.BinOp._MAPPING[op], + insert_colrefs( + left.value, + table_ref=plc.expressions.TableReference.LEFT, + name_to_index={ + name: i for i, name in enumerate(inp_left.schema) + }, + ), + insert_colrefs( + right.value, + table_ref=plc.expressions.TableReference.RIGHT, + name_to_index={ + name: i for i, name in enumerate(inp_right.schema) + }, + ), + ) for op, left, right in zip(ops, left_on, right_on, strict=True) ), ) - filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross) - if zlice is not None: - offset, length = zlice - return ir.Slice(schema, offset, length, filtered) - return filtered + + return ir.ConditionalJoin(schema, predicate, node.options, inp_left, inp_right) @_translate_ir.register diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index a119cab3b74..b08cede8f7f 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -14,6 +14,8 @@ POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11") POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12") +POLARS_VERSION_GT_112 = POLARS_VERSION > parse("1.12") +POLARS_VERSION_LT_113 = POLARS_VERSION < parse("1.13") def _ensure_polars_version(): diff --git 
a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py index 57d794d4890..8f10f119199 100644 --- a/python/cudf_polars/tests/dsl/test_to_ast.py +++ b/python/cudf_polars/tests/dsl/test_to_ast.py @@ -3,6 +3,7 @@ from __future__ import annotations +import pyarrow as pa import pytest import polars as pl @@ -10,10 +11,11 @@ import pylibcudf as plc +import cudf_polars.dsl.expr as expr_nodes import cudf_polars.dsl.ir as ir_nodes from cudf_polars import translate_ir from cudf_polars.containers.dataframe import DataFrame, NamedColumn -from cudf_polars.dsl.to_ast import to_ast +from cudf_polars.dsl.to_ast import insert_colrefs, to_ast, to_parquet_filter @pytest.fixture(scope="module") @@ -65,7 +67,14 @@ def test_compute_column(expr, df): name_to_index = {c.name: i for i, c in enumerate(table.columns)} def compute_column(e): - ast = to_ast(e.value, name_to_index=name_to_index) + e_with_colrefs = insert_colrefs( + e.value, + table_ref=plc.expressions.TableReference.LEFT, + name_to_index=name_to_index, + ) + with pytest.raises(NotImplementedError): + e_with_colrefs.evaluate(table) + ast = to_ast(e_with_colrefs) if ast is not None: return NamedColumn( plc.transform.compute_column(table.table, ast), name=e.name @@ -77,3 +86,28 @@ def compute_column(e): expect = q.collect() assert_frame_equal(expect, got) + + +def test_invalid_colref_construction_raises(): + literal = expr_nodes.Literal( + plc.DataType(plc.TypeId.INT8), pa.scalar(1, type=pa.int8()) + ) + with pytest.raises(TypeError): + expr_nodes.ColRef( + literal.dtype, 0, plc.expressions.TableReference.LEFT, literal + ) + + +def test_to_ast_without_colref_raises(): + col = expr_nodes.Col(plc.DataType(plc.TypeId.INT8), "a") + + with pytest.raises(TypeError): + to_ast(col) + + +def test_to_parquet_filter_with_colref_raises(): + col = expr_nodes.Col(plc.DataType(plc.TypeId.INT8), "a") + colref = expr_nodes.ColRef(col.dtype, 0, plc.expressions.TableReference.LEFT, col) + + with 
pytest.raises(TypeError): + to_parquet_filter(colref) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 8ca7a7b9264..2fcbbf21f1c 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -13,7 +13,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils.versions import POLARS_VERSION_LT_112 +from cudf_polars.utils.versions import POLARS_VERSION_LT_112, POLARS_VERSION_LT_113 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"]) @@ -110,7 +110,11 @@ def test_cross_join(left, right, zlice): @pytest.mark.parametrize( - "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))] + "left_on,right_on", + [ + (pl.col("a"), pl.lit(2, dtype=pl.Int64)), + (pl.lit(2, dtype=pl.Int64), pl.col("a")), + ], ) def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") @@ -125,7 +129,13 @@ def test_join_literal_key_unsupported(left, right, left_on, right_on): [pl.col("a_right") <= pl.col("a") * 2], [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], - [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + pytest.param( + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + marks=pytest.mark.xfail( + POLARS_VERSION_LT_113, + reason="https://github.com/pola-rs/polars/issues/19597", + ), + ), ], ) def test_join_where(left, right, conditions, zlice): From 0f1ae264fc88d49e3be00d776597e102ed4730c0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 8 Nov 2024 08:55:15 -0600 Subject: [PATCH 18/40] Wrap custom iterator result (#17251) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: #17165 Fixes: https://github.com/rapidsai/cudf/issues/14481 This PR properly wraps the result of 
custom iterator. ```python In [2]: import pandas as pd In [3]: s = pd.Series([10, 1, 2, 3, 4, 5]*1000000) # Without custom_iter: In [4]: %timeit for i in s: True 6.34 s ± 25.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # This PR: In [4]: %timeit for i in s: True 6.16 s ± 17.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # On `branch-24.12`: 1.53 s ± 6.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` I think `custom_iter` has to exist. Here is why, invoking any sort of `iteration` on GPU objects will raise errors and thus in the end we fall-back to CPU. Instead of trying to move the objects from host to device memory (if the object is on host memory only), we will avoid a CPU-to-GPU transfer. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17251 --- .../cudf/source/developer_guide/cudf_pandas.md | 3 ++- python/cudf/cudf/pandas/_wrappers/common.py | 11 ++++++++++- python/cudf/cudf/pandas/fast_slow_proxy.py | 4 +++- .../cudf/cudf_pandas_tests/test_cudf_pandas.py | 18 ++++++++++++++++++ 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md index 911a64fa152..b653b786129 100644 --- a/docs/cudf/source/developer_guide/cudf_pandas.md +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -11,7 +11,8 @@ In the rest of this document, to maintain a concrete pair of libraries in mind, For example, future support could include pairs such as CuPy (as the "fast" library) and NumPy (as the "slow" library). ```{note} -We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type. +1. We currently do not wrap the entire NumPy library because it exposes a C API. 
But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type. +2. There is a `custom_iter` method defined to always utilize slow objects `iter` method, that way we don't move the objects to GPU and trigger an error and again move the object to CPU to execute the iteration successfully. ``` ### Types: diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py index 66a51a83896..b801654068e 100644 --- a/python/cudf/cudf/pandas/_wrappers/common.py +++ b/python/cudf/cudf/pandas/_wrappers/common.py @@ -52,4 +52,13 @@ def array_interface(self: _FastSlowProxy): def custom_iter(self: _FastSlowProxy): - return iter(self._fsproxy_slow) + """ + Custom iter method to handle the case where only the slow + object's iter method is used. + """ + # NOTE: Do not remove this method. This is required to avoid + # falling back to GPU for iter method. + return _maybe_wrap_result( + iter(self._fsproxy_slow), + None, # type: ignore + ) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 73afde407db..99c0cb82f41 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1099,7 +1099,9 @@ def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any: """ Wraps "result" in a fast-slow proxy if is a "proxiable" object. 
""" - if _is_final_type(result): + if isinstance(result, (int, str, float, bool, type(None))): + return result + elif _is_final_type(result): typ = get_final_type_map()[type(result)] return typ._fsproxy_wrap(result, func) elif _is_intermediate_type(result): diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 3e7d1cf3c4c..e260b448219 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1777,3 +1777,21 @@ def test_cudf_pandas_util_version(attrs): assert not hasattr(pd.util, attrs) else: assert hasattr(pd.util, attrs) + + +def test_iteration_over_dataframe_dtypes_produces_proxy_objects(dataframe): + _, xdf = dataframe + xdf["b"] = xpd.IntervalIndex.from_arrays(xdf["a"], xdf["b"]) + xdf["a"] = xpd.Series([1, 1, 1, 2, 3], dtype="category") + dtype_series = xdf.dtypes + assert all(is_proxy_object(x) for x in dtype_series) + assert isinstance(dtype_series.iloc[0], xpd.CategoricalDtype) + assert isinstance(dtype_series.iloc[1], xpd.IntervalDtype) + + +def test_iter_doesnot_raise(monkeypatch): + s = xpd.Series([1, 2, 3]) + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + for _ in s: + pass From 263a7ff78d7777b873bcd79741ab487deb55e87b Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Sat, 9 Nov 2024 00:05:40 +0800 Subject: [PATCH 19/40] Make constructor of DeviceMemoryBufferView public (#17265) Make constructor of DeviceMemoryBufferView and ContiguousTable public. 
Authors: - Renjie Liu (https://github.com/liurenjie1024) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17265 --- java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java index e48b1cf59e4..86b6b98f2ae 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ * that is backing it. */ public class DeviceMemoryBufferView extends BaseDeviceMemoryBuffer { - DeviceMemoryBufferView(long address, long lengthInBytes) { + public DeviceMemoryBufferView(long address, long lengthInBytes) { // Set the cleaner to null so we don't end up releasing anything super(address, lengthInBytes, (MemoryBufferCleaner) null); } From c46cf76921570487ca9eed4f73ccdf59ba004f28 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 8 Nov 2024 11:39:33 -0600 Subject: [PATCH 20/40] remove WheelHelpers.cmake (#17276) Related to https://github.com/rapidsai/build-planning/issues/33 and https://github.com/rapidsai/build-planning/issues/74 The last use of CMake function `install_aliased_imported_targets()` here was removed in #16946. This proposes removing the file holding its definition. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17276 --- .../libcudf/cmake/Modules/WheelHelpers.cmake | 59 ------------------- 1 file changed, 59 deletions(-) delete mode 100644 python/libcudf/cmake/Modules/WheelHelpers.cmake diff --git a/python/libcudf/cmake/Modules/WheelHelpers.cmake b/python/libcudf/cmake/Modules/WheelHelpers.cmake deleted file mode 100644 index 278d6751c15..00000000000 --- a/python/libcudf/cmake/Modules/WheelHelpers.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -include_guard(GLOBAL) - -# Making libraries available inside wheels by installing the associated targets. 
-function(install_aliased_imported_targets) - list(APPEND CMAKE_MESSAGE_CONTEXT "install_aliased_imported_targets") - - set(options "") - set(one_value "DESTINATION") - set(multi_value "TARGETS") - cmake_parse_arguments(_ "${options}" "${one_value}" "${multi_value}" ${ARGN}) - - message(VERBOSE "Installing targets '${__TARGETS}' into lib_dir '${__DESTINATION}'") - - foreach(target IN LISTS __TARGETS) - - if(NOT TARGET ${target}) - message(VERBOSE "No target named ${target}") - continue() - endif() - - get_target_property(alias_target ${target} ALIASED_TARGET) - if(alias_target) - set(target ${alias_target}) - endif() - - get_target_property(is_imported ${target} IMPORTED) - if(NOT is_imported) - # If the target isn't imported, install it into the wheel - install(TARGETS ${target} DESTINATION ${__DESTINATION}) - message(VERBOSE "install(TARGETS ${target} DESTINATION ${__DESTINATION})") - else() - # If the target is imported, make sure it's global - get_target_property(type ${target} TYPE) - if(${type} STREQUAL "UNKNOWN_LIBRARY") - install(FILES $ DESTINATION ${__DESTINATION}) - message(VERBOSE "install(FILES $ DESTINATION ${__DESTINATION})") - else() - install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION}) - message( - VERBOSE - "install(IMPORTED_RUNTIME_ARTIFACTS $ DESTINATION ${__DESTINATION})" - ) - endif() - endif() - endforeach() -endfunction() From 990734f896c70c91501e580d0a6dc087179ad475 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 8 Nov 2024 13:06:30 -0600 Subject: [PATCH 21/40] Switch to using `TaskSpec` (#17285) https://github.com/dask/dask-expr/pull/1159 made upstream changes in `dask-expr` to use `TaskSpec`, this PR updates `dask-cudf` to be compatible with those changes. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/17285 --- python/dask_cudf/dask_cudf/io/parquet.py | 26 +++++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 48cea7266af..a7a116875ea 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -5,6 +5,8 @@ from dask_expr.io.io import FusedParquetIO from dask_expr.io.parquet import FragmentWrapper, ReadParquetPyarrowFS +from dask._task_spec import Task + import cudf from dask_cudf import _deprecated_api @@ -19,7 +21,7 @@ def _load_multiple_files( frag_filters, columns, schema, - *to_pandas_args, + **to_pandas_kwargs, ): import pyarrow as pa @@ -46,7 +48,7 @@ def _load_multiple_files( ) return CudfReadParquetPyarrowFS._table_to_pandas( get(dsk, name), - *to_pandas_args, + **to_pandas_kwargs, ) @@ -89,7 +91,7 @@ def _table_to_pandas(table, index_name): df = df.set_index(index_name) return df - def _filtered_task(self, index: int): + def _filtered_task(self, name, index: int): columns = self.columns.copy() index_name = self.index.name if self.index is not None: @@ -99,16 +101,20 @@ def _filtered_task(self, index: int): if columns is None: columns = list(schema.names) columns.append(index_name) - return ( + return Task( + name, self._table_to_pandas, - ( + Task( + None, self._fragment_to_table, - FragmentWrapper(self.fragments[index], filesystem=self.fs), - self.filters, - columns, - schema, + fragment_wrapper=FragmentWrapper( + self.fragments[index], filesystem=self.fs + ), + filters=self.filters, + columns=columns, + schema=schema, ), - index_name, + index_name=index_name, ) def _tune_up(self, parent): From 2e0d2d6a0859b2cad34a36513b6977cf2bbe172f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 Nov 2024 13:15:26 -0800 
Subject: [PATCH 22/40] Improve the performance of low cardinality groupby (#16619) This PR enhances groupby performance for low-cardinality input cases. When applicable, it leverages shared memory for initial aggregation, followed by global memory aggregation to reduce atomic contention and improve performance. Authors: - Yunsong Wang (https://github.com/PointKernel) - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16619 --- cpp/CMakeLists.txt | 5 +- cpp/src/groupby/groupby.cu | 1 - cpp/src/groupby/hash/compute_aggregations.cu | 29 +++ cpp/src/groupby/hash/compute_aggregations.cuh | 185 ++++++++++++++++++ ...pass_aggs.hpp => compute_aggregations.hpp} | 16 +- .../groupby/hash/compute_aggregations_null.cu | 29 +++ .../hash/compute_global_memory_aggs.cu | 32 +++ .../hash/compute_global_memory_aggs.cuh | 89 +++++++++ .../hash/compute_global_memory_aggs.hpp | 42 ++++ .../hash/compute_global_memory_aggs_null.cu | 32 +++ cpp/src/groupby/hash/compute_groupby.cu | 43 +--- cpp/src/groupby/hash/compute_groupby.hpp | 17 -- .../hash/compute_shared_memory_aggs.cu | 19 +- .../hash/compute_shared_memory_aggs.hpp | 7 +- .../groupby/hash/compute_single_pass_aggs.cu | 99 ---------- .../hash/create_sparse_results_table.cu | 115 ++++++++--- .../hash/create_sparse_results_table.hpp | 27 ++- cpp/src/groupby/hash/helpers.cuh | 2 - cpp/src/groupby/hash/single_pass_functors.cuh | 118 ++++++++++- 19 files changed, 699 insertions(+), 208 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_aggregations.cu create mode 100644 cpp/src/groupby/hash/compute_aggregations.cuh rename cpp/src/groupby/hash/{compute_single_pass_aggs.hpp => compute_aggregations.hpp} (70%) create mode 100644 cpp/src/groupby/hash/compute_aggregations_null.cu create mode 100644 
cpp/src/groupby/hash/compute_global_memory_aggs.cu create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.cuh create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.hpp create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs_null.cu delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d3bf7019e35..559826ac232 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -394,11 +394,14 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_aggregations.cu + src/groupby/hash/compute_aggregations_null.cu + src/groupby/hash/compute_global_memory_aggs.cu + src/groupby/hash/compute_global_memory_aggs_null.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_mapping_indices.cu src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_shared_memory_aggs.cu - src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index cc0682b68b9..6eb82618e2a 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu new file mode 100644 index 00000000000..cac6c2224f0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + global_set_t& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh new file mode 100644 index 00000000000..e8b29a0e7a8 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_aggregations.hpp" +#include "compute_global_memory_aggs.hpp" +#include "compute_mapping_indices.hpp" +#include "compute_shared_memory_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + // flatten the aggs to a table that can be operated on by aggregate_row + auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + + auto const grid_size = + max_occupancy_grid_size>(num_rows); + auto const available_shmem_size = get_available_shared_memory_size(grid_size); + auto const has_sufficient_shmem = + available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2); + auto const has_dictionary_request = std::any_of( + requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { + return cudf::is_dictionary(request.values.type()); + }); + auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem; + + // Performs naive global memory aggregations when the workload is not compatible with shared + // memory, such as when aggregating dictionary columns or when there is insufficient 
dynamic + // shared memory for shared memory aggregations. + if (!is_shared_memory_compatible) { + return compute_global_memory_aggs(num_rows, + skip_rows_with_nulls, + row_bitmask, + flattened_values, + d_agg_kinds.data(), + agg_kinds, + global_set, + aggs, + sparse_results, + stream); + } + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank + rmm::device_uvector local_mapping_index(num_rows, stream); + // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + + // Flag indicating whether a global memory aggregation fallback is required or not + rmm::device_scalar needs_global_memory_fallback(stream); + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + compute_mapping_indices(grid_size, + num_rows, + global_set_ref, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + needs_global_memory_fallback.data(), + stream); + + cuda::std::atomic_flag h_needs_fallback; + // Cannot use `device_scalar::value` as it requires a copy constructor, which + // `atomic_flag` doesn't have. 
+ CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback, + needs_global_memory_fallback.data(), + sizeof(cuda::std::atomic_flag), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + auto const needs_fallback = h_needs_fallback.test(); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + needs_fallback, + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + + compute_shared_memory_aggs(grid_size, + available_shmem_size, + num_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); + + // The shared memory groupby is designed so that each thread block can handle up to 128 unique + // keys. When a block reaches this cardinality limit, shared memory becomes insufficient to store + // the temporary aggregation results. In these situations, we must fall back to a global memory + // aggregator to process the remaining aggregation requests. 
+ if (needs_fallback) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + global_memory_fallback_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + row_bitmask, + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp similarity index 70% rename from cpp/src/groupby/hash/compute_single_pass_aggs.hpp rename to cpp/src/groupby/hash/compute_aggregations.hpp index a7434bdf61a..829c3c808b0 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -21,6 +21,7 @@ #include #include +#include namespace cudf::groupby::detail::hash { /** @@ -28,11 +29,12 @@ namespace cudf::groupby::detail::hash { * over the data and stores the results in `sparse_results` */ template -void compute_single_pass_aggs(int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - SetType set, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); +rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git 
a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu new file mode 100644 index 00000000000..1d7184227ea --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations_null.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + nullable_global_set_t& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu new file mode 100644 index 00000000000..6025686953e --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh new file mode 100644 index 00000000000..00db149c6d9 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_global_memory_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + auto constexpr uses_global_memory_aggs = true; + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + uses_global_memory_aggs, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + hash::compute_single_pass_aggs_fn{ + global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggregations.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + 
sparse_results->add_result( + flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp new file mode 100644 index 00000000000..0777b9ffd93 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu new file mode 100644 index 00000000000..209e2b7f20a --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + nullable_global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 59457bea694..e1dbf2a3d9e 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ +#include "compute_aggregations.hpp" #include "compute_groupby.hpp" -#include "compute_single_pass_aggs.hpp" #include "helpers.cuh" #include "sparse_to_dense_results.hpp" @@ -29,7 +29,6 @@ #include #include -#include #include #include @@ -38,18 +37,6 @@ #include namespace cudf::groupby::detail::hash { -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream) -{ - rmm::device_uvector populated_keys(num_keys, stream); - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); - return populated_keys; -} - template std::unique_ptr compute_groupby(table_view const& keys, host_span requests, @@ -67,8 +54,8 @@ std::unique_ptr
compute_groupby(table_view const& keys, // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); - auto const set = cuco::static_set{ - num_keys, + auto set = cuco::static_set{ + cuco::extent{num_keys}, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, d_row_equal, @@ -84,17 +71,13 @@ std::unique_ptr
compute_groupby(table_view const& keys, : rmm::device_buffer{}; // Compute all single pass aggs first - compute_single_pass_aggs(num_keys, - skip_rows_with_nulls, - static_cast(row_bitmask.data()), - set.ref(cuco::insert_and_find), - requests, - &sparse_results, - stream); - - // Extract the populated indices from the hash set and create a gather map. - // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); + auto gather_map = compute_aggregations(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set, + requests, + &sparse_results, + stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(requests, @@ -114,12 +97,6 @@ std::unique_ptr
compute_groupby(table_view const& keys, mr); } -template rmm::device_uvector extract_populated_keys( - global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); - -template rmm::device_uvector extract_populated_keys( - nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); - template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp index 7bb3a60ff07..77243dc0a4f 100644 --- a/cpp/src/groupby/hash/compute_groupby.hpp +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -22,28 +22,11 @@ #include #include -#include #include #include namespace cudf::groupby::detail::hash { -/** - * @brief Computes and returns a device vector containing all populated keys in - * `key_set`. - * - * @tparam SetType Type of key hash set - * - * @param key_set Key hash set - * @param num_keys Number of input keys - * @param stream CUDA stream used for device memory operations and kernel launches - * @return An array of unique keys contained in `key_set` - */ -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream); - /** * @brief Computes groupby using hash table. * diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 12c02a1865e..f0361ccced2 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -47,9 +47,8 @@ struct size_of_functor { /// Shared memory data alignment CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8; -// Prepares shared memory data required by each output column, exits if -// no enough memory space to perform the shared memory aggregation for the -// current output column +// Allocates shared memory required for output columns. Exits if there is insufficient memory to +// perform shared memory aggregation for the current output column. 
__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, @@ -74,9 +73,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, ALIGNMENT); auto const next_col_total_size = next_col_size + valid_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { - CUDF_UNREACHABLE("Not enough memory for shared memory aggregations"); - } + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } shmem_agg_res_offsets[col_end] = bytes_allocated; shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size; @@ -275,7 +272,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, } } // namespace -std::size_t available_shared_memory_size(cudf::size_type grid_size) +std::size_t get_available_shared_memory_size(cudf::size_type grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); @@ -302,11 +299,11 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, { // For each aggregation, need one offset determining where the aggregation is // performed, another indicating the validity of the aggregation - auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type); + auto const offsets_size = compute_shmem_offsets_size(output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2, + CUDF_EXPECTS(available_shmem_size > offsets_size * 2, "No enough space for shared memory aggregations"); - auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2; + auto const shmem_agg_size = available_shmem_size - offsets_size * 2; single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, @@ -318,6 +315,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, output_values, d_agg_kinds, shmem_agg_size, - 
shmem_offsets_size); + offsets_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 653821fd53b..346956cdab0 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -22,8 +22,12 @@ #include namespace cudf::groupby::detail::hash { +std::size_t get_available_shared_memory_size(cudf::size_type grid_size); -std::size_t available_shared_memory_size(cudf::size_type grid_size); +std::size_t constexpr compute_shmem_offsets_size(cudf::size_type num_cols) +{ + return sizeof(cudf::size_type) * num_cols; +} void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, @@ -37,5 +41,4 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, rmm::cuda_stream_view stream); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu deleted file mode 100644 index e292543e6e9..00000000000 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "compute_single_pass_aggs.hpp" -#include "create_sparse_results_table.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" -#include "var_hash_functor.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -namespace cudf::groupby::detail::hash { -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -void compute_single_pass_aggs(int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - SetType set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream) -{ - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - - // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, cudf::get_current_device_resource_ref()); - - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - num_keys, - hash::compute_single_pass_aggs_fn{ - set, *d_values, *d_sparse_table, d_aggs.data(), row_bitmask, skip_rows_with_nulls}); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], 
std::move(sparse_result_cols[i])); - } -} - -template void compute_single_pass_aggs>( - int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - hash_set_ref_t set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); - -template void compute_single_pass_aggs>( - int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - nullable_hash_set_ref_t set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index 22fa4fc584c..bc32e306b3f 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -15,53 +15,110 @@ */ #include "create_sparse_results_table.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" +#include #include #include -#include -#include -#include #include #include #include +#include + +#include #include #include #include namespace cudf::groupby::detail::hash { +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + // make table that will hold sparse results -cudf::table create_sparse_results_table(table_view const& flattened_values, - std::vector aggs, +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream) { // TODO single allocation - room for performance improvement - std::vector> 
sparse_columns; - sparse_columns.reserve(flattened_values.num_columns()); - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; + std::vector> sparse_columns; + std::transform(flattened_values.begin(), + flattened_values.end(), + agg_kinds.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto const mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); + } + // Else initialize the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, agg_kinds, stream); + } + return sparse_table; +} - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); +template void extract_populated_keys( + global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); +template void extract_populated_keys( + nullable_global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); - table sparse_table(std::move(sparse_columns)); - mutable_table_view table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(table_view, aggs, stream); - return sparse_table; -} +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + nullable_global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index c1d4e0d3f20..8155ce852e0 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -15,18 +15,41 @@ */ #pragma once +#include #include #include #include #include #include +#include #include namespace cudf::groupby::detail::hash { +/** + * @brief Retrieves all populated keys in `key_set` and writes them into + * `populated_keys`. 
+ * + * @tparam SetType Type of the key hash set + * + * @param key_set Key hash set + * @param populated_keys Output array that receives the unique keys contained in `key_set` + * @param stream CUDA stream used for device memory operations and kernel launches + * @note `populated_keys` is resized to the number of keys retrieved from `key_set` + */ +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + // make table that will hold sparse results -cudf::table create_sparse_results_table(table_view const& flattened_values, - std::vector aggs_kinds, +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 00836567b4f..f950e03e0fb 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -23,8 +23,6 @@ #include namespace cudf::groupby::detail::hash { -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 28a5b578e00..048c9252773 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -15,12 +15,14 @@ */ #pragma once -#include +#include "helpers.cuh" + #include -#include -#include +#include +#include +#include -#include +#include namespace cudf::groupby::detail::hash { // TODO: TO BE REMOVED issue tracked via #17171 @@ -104,6 +106,114 @@ struct 
initialize_shmem { } }; +template +struct 
initialize_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_width() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +struct initialize_gmem { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + initialize_target_element_gmem{}(target, target_index); + } +}; + +struct initialize_sparse_table { + cudf::size_type const* row_indices; + cudf::mutable_table_device_view sparse_table; + cudf::aggregation::Kind const* __restrict__ aggs; + initialize_sparse_table(cudf::size_type const* row_indices, + cudf::mutable_table_device_view sparse_table, + cudf::aggregation::Kind const* aggs) + : row_indices(row_indices), sparse_table(sparse_table), aggs(aggs) + { + } + __device__ void operator()(cudf::size_type i) + { + auto key_idx = row_indices[i]; + for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { + cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), + aggs[col_idx], + initialize_gmem{}, + sparse_table.column(col_idx), + key_idx); + } + } +}; + +template +struct global_memory_fallback_fn { + SetType set; + cudf::table_device_view input_values; 
+ cudf::mutable_table_device_view output_values; + cudf::aggregation::Kind const* __restrict__ aggs; + cudf::size_type* block_cardinality; + cudf::size_type stride; + bitmask_type const* __restrict__ row_bitmask; + bool skip_rows_with_nulls; + + global_memory_fallback_fn(SetType set, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + cudf::size_type* block_cardinality, + cudf::size_type stride, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls) + : set(set), + input_values(input_values), + output_values(output_values), + aggs(aggs), + block_cardinality(block_cardinality), + stride(stride), + row_bitmask(row_bitmask), + skip_rows_with_nulls(skip_rows_with_nulls) + { + } + + __device__ void operator()(cudf::size_type i) + { + auto const block_id = (i % stride) / GROUPBY_BLOCK_SIZE; + if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) { + auto const result = set.insert_and_find(i); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + } + } +}; + /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys From d295f17f4468004367fe60088854ac5513519d32 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:22:08 -0500 Subject: [PATCH 23/40] Add `cudf::calendrical_month_sequence` to pylibcudf (#17277) Part of #15162. Also adds tests for `pylibcudf.filling`. 
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17277 --- python/cudf/cudf/_lib/datetime.pyx | 21 ++--- python/pylibcudf/pylibcudf/filling.pxd | 6 ++ python/pylibcudf/pylibcudf/filling.pyx | 37 ++++++++ .../pylibcudf/pylibcudf/tests/test_filling.py | 91 +++++++++++++++++++ 4 files changed, 140 insertions(+), 15 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/test_filling.py diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 2c7a585f4b1..7e8f29dac93 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -4,13 +4,7 @@ import warnings from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - cimport pylibcudf.libcudf.datetime as libcudf_datetime -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.filling cimport calendrical_month_sequence -from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.datetime import DatetimeComponent, RoundingFrequency @@ -143,20 +137,17 @@ def is_leap_year(Column col): @acquire_spill_lock() def date_range(DeviceScalar start, size_type n, offset): - cdef unique_ptr[column] c_result cdef size_type months = ( offset.kwds.get("years", 0) * 12 + offset.kwds.get("months", 0) ) - - cdef const scalar* c_start = start.get_raw_ptr() - with nogil: - c_result = move(calendrical_month_sequence( + return Column.from_pylibcudf( + plc.filling.calendrical_month_sequence( n, - c_start[0], - months - )) - return Column.from_unique_ptr(move(c_result)) + start.c_value, + months, + ) + ) @acquire_spill_lock() diff --git a/python/pylibcudf/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd index b9345f8cd42..56aef086e1b 100644 --- a/python/pylibcudf/pylibcudf/filling.pxd +++ 
b/python/pylibcudf/pylibcudf/filling.pxd @@ -33,3 +33,9 @@ cpdef Table repeat( Table input_table, ColumnOrSize count ) + +cpdef Column calendrical_month_sequence( + size_type n, + Scalar init, + size_type months, +) diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index a47004a1e42..313605ead16 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.filling cimport ( fill_in_place as cpp_fill_in_place, repeat as cpp_repeat, sequence as cpp_sequence, + calendrical_month_sequence as cpp_calendrical_month_sequence ) from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -164,3 +165,39 @@ cpdef Table repeat( count ) return Table.from_libcudf(move(result)) + + +cpdef Column calendrical_month_sequence( + size_type n, + Scalar init, + size_type months, +): + + """Generate a sequence of `n` timestamps starting at `init`, each `months` months after the previous one. + + For details, see :cpp:func:`calendrical_month_sequence`. + + Parameters + ---------- + n : size_type + Number of timestamps to generate + init : Scalar + The initial timestamp + months : size_type + Months to increment + + Returns + ------- + pylibcudf.Column + Timestamps column with sequences of months + """ + + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_calendrical_month_sequence( + n, + dereference(init.c_obj), + months + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_filling.py b/python/pylibcudf/pylibcudf/tests/test_filling.py new file mode 100644 index 00000000000..91c7e42a0a0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_filling.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from datetime import datetime + +import pyarrow as pa +import pytest +from utils import assert_column_eq, assert_table_eq + +import pylibcudf as plc + + +@pytest.fixture +def pa_col(): + return pa.array([2, 3, 5, 7, 11]) + + +@pytest.fixture +def pa_table(): + pa_col = pa.array([1, 2, 3]) + return pa.table([pa_col], names=["a"]) + + +def test_fill(pa_col): + result = plc.filling.fill( + plc.interop.from_arrow(pa_col), + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_fill_in_place(pa_col): + result = plc.interop.from_arrow(pa_col) + plc.filling.fill_in_place( + result, + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_sequence(): + size = 5 + init_scalar = plc.interop.from_arrow(pa.scalar(10)) + step_scalar = plc.interop.from_arrow(pa.scalar(2)) + result = plc.filling.sequence( + size, + init_scalar, + step_scalar, + ) + expect = pa.array([10, 12, 14, 16, 18]) + assert_column_eq(result, expect) + + +def test_repeat_with_count_int(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = 2 + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1, 1, 2, 2, 3, 3]], names=["a"]) + assert_table_eq(expect, result) + + +def test_repeat_with_count_column(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = plc.interop.from_arrow(pa.array([1, 2, 3])) + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1] + [2] * 2 + [3] * 3], names=["a"]) + assert_table_eq(expect, result) + + +def test_calendrical_month_sequence(): + n = 5 + init_date = datetime(2020, 1, 31) + init = plc.interop.from_arrow( + pa.scalar(init_date, type=pa.timestamp("ms")) + ) + months = 1 + result = plc.filling.calendrical_month_sequence(n, init, months) + expected_dates = [ + datetime(2020, 1, 31), + datetime(2020, 2, 29), + datetime(2020, 3, 31), + 
datetime(2020, 4, 30), + datetime(2020, 5, 31), + ] + expect = pa.array(expected_dates, type=pa.timestamp("ms")) + assert_column_eq(result, expect) From fea46cd869bac0e312a898ca959783aa8db2ad5f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:14:55 -0800 Subject: [PATCH 24/40] Add read_parquet_metadata to pylibcudf (#17245) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17245 --- docs/cudf/source/conf.py | 2 + .../api_docs/pylibcudf/io/index.rst | 1 + .../pylibcudf/io/parquet_metadata.rst | 6 + python/cudf/cudf/_lib/io/utils.pxd | 1 - python/cudf/cudf/_lib/io/utils.pyx | 56 ----- python/cudf/cudf/_lib/parquet.pyx | 67 ++---- python/cudf/cudf/tests/test_parquet.py | 4 +- python/pylibcudf/pylibcudf/io/CMakeLists.txt | 4 +- python/pylibcudf/pylibcudf/io/__init__.pxd | 12 +- python/pylibcudf/pylibcudf/io/__init__.py | 13 +- .../pylibcudf/io/parquet_metadata.pxd | 51 +++++ .../pylibcudf/io/parquet_metadata.pyx | 207 ++++++++++++++++++ .../pylibcudf/libcudf/io/parquet_metadata.pxd | 4 +- 13 files changed, 318 insertions(+), 110 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst create mode 100644 python/pylibcudf/pylibcudf/io/parquet_metadata.pxd create mode 100644 python/pylibcudf/pylibcudf/io/parquet_metadata.pyx diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 5942cc16850..0d463b918d3 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -554,6 +554,8 @@ def on_missing_reference(app, env, node, contnode): nitpick_ignore = [ + # Erroneously warned in ParquetColumnSchema.name + ("py:class", "unicode"), ("py:class", "SeriesOrIndex"), ("py:class", "Dtype"), # The following are erroneously 
warned due to diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index cd5c5a5f77e..1c1c8040972 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -19,5 +19,6 @@ I/O Functions csv json parquet + parquet_metadata text timezone diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst new file mode 100644 index 00000000000..fce964f9714 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst @@ -0,0 +1,6 @@ +================ +Parquet Metadata +================ + +.. automodule:: pylibcudf.io.parquet_metadata + :members: diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 76a6e32fde0..96504ebdd66 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -13,7 +13,6 @@ from pylibcudf.libcudf.io.types cimport ( from cudf._lib.column cimport Column -cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & data) except* cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 564daefbae2..f23980b387a 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -7,76 +7,20 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -from pylibcudf.io.datasource cimport Datasource from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.datasource cimport datasource from pylibcudf.libcudf.io.types cimport ( column_name_info, - host_buffer, sink_info, - source_info, ) from cudf._lib.column cimport Column import codecs -import errno 
import io import os from cudf.core.dtypes import StructDtype - -# Converts the Python source input to libcudf IO source_info -# with the appropriate type and source values -cdef source_info make_source_info(list src) except*: - if not src: - raise ValueError("Need to pass at least one source") - - cdef const unsigned char[::1] c_buffer - cdef vector[host_buffer] c_host_buffers - cdef vector[string] c_files - cdef Datasource csrc - cdef vector[datasource*] c_datasources - empty_buffer = False - if isinstance(src[0], bytes): - empty_buffer = True - for buffer in src: - if (len(buffer) > 0): - c_buffer = buffer - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - empty_buffer = False - elif isinstance(src[0], io.BytesIO): - for bio in src: - c_buffer = bio.getbuffer() # check if empty? - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - # Otherwise src is expected to be a numeric fd, string path, or PathLike. - # TODO (ptaylor): Might need to update this check if accepted input types - # change when UCX and/or cuStreamz support is added. - elif isinstance(src[0], Datasource): - for csrc in src: - c_datasources.push_back(csrc.get_datasource()) - return source_info(c_datasources) - elif isinstance(src[0], (int, float, complex, basestring, os.PathLike)): - # If source is a file, return source_info where type=FILEPATH - if not all(os.path.isfile(file) for file in src): - raise FileNotFoundError(errno.ENOENT, - os.strerror(errno.ENOENT), - src) - - files = [ str(elem).encode() for elem in src] - c_files = files - return source_info(c_files) - else: - raise TypeError("Unrecognized input type: {}".format(type(src[0]))) - - if empty_buffer is True: - c_host_buffers.push_back(host_buffer(NULL, 0)) - - return source_info(c_host_buffers) - # Converts the Python sink input to libcudf IO sink_info. 
cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & sink diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 1212637d330..d4bd0cd306c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -27,7 +27,6 @@ from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string -from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move from libcpp.vector cimport vector @@ -41,12 +40,7 @@ from pylibcudf.libcudf.io.parquet cimport ( parquet_writer_options, write_parquet as parquet_writer, ) -from pylibcudf.libcudf.io.parquet_metadata cimport ( - parquet_metadata, - read_parquet_metadata as parquet_metadata_reader, -) from pylibcudf.libcudf.io.types cimport ( - source_info, sink_info, column_in_metadata, table_input_metadata, @@ -62,7 +56,6 @@ from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( add_df_col_struct_names, make_sinks_info, - make_source_info, ) from cudf._lib.utils cimport table_view_from_table @@ -373,7 +366,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, nrows=nrows, skip_rows=skip_rows) return df -cpdef read_parquet_metadata(filepaths_or_buffers): +cpdef read_parquet_metadata(list filepaths_or_buffers): """ Cython function to call into libcudf API, see `read_parquet_metadata`. 
@@ -382,56 +375,40 @@ cpdef read_parquet_metadata(filepaths_or_buffers): cudf.io.parquet.read_parquet cudf.io.parquet.to_parquet """ - cdef source_info source = make_source_info(filepaths_or_buffers) - - args = move(source) - - cdef parquet_metadata c_result - - # Read Parquet metadata - with nogil: - c_result = move(parquet_metadata_reader(args)) - - # access and return results - num_rows = c_result.num_rows() - num_rowgroups = c_result.num_rowgroups() - - # extract row group metadata and sanitize keys - row_group_metadata = [{k.decode(): v for k, v in metadata} - for metadata in c_result.rowgroup_metadata()] + parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( + plc.io.SourceInfo(filepaths_or_buffers) + ) # read all column names including index column, if any - col_names = [info.name().decode() for info in c_result.schema().root().children()] - - # access the Parquet file_footer to find the index - index_col = None - cdef unordered_map[string, string] file_footer = c_result.metadata() + col_names = [info.name() for info in parquet_metadata.schema().root().children()] - # get index column name(s) - index_col_names = None - json_str = file_footer[b'pandas'].decode('utf-8') - meta = None + index_col_names = set() + json_str = parquet_metadata.metadata()['pandas'] if json_str != "": meta = json.loads(json_str) file_is_range_index, index_col, _ = _parse_metadata(meta) - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} + if ( + not file_is_range_index + and index_col is not None + ): + columns = meta['columns'] for idx_col in index_col: - for c in meta['columns']: + for c in columns: if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] + index_col_names.add(idx_col) # remove the index column from the list of column names # only if index_col_names is not None - if index_col_names is not None: + if len(index_col_names) >= 0: col_names = [name for name in col_names if name not in 
index_col_names] - # num_columns = length of list(col_names) - num_columns = len(col_names) - - # return the metadata - return num_rows, num_rowgroups, col_names, num_columns, row_group_metadata + return ( + parquet_metadata.num_rows(), + parquet_metadata.num_rowgroups(), + col_names, + len(col_names), + parquet_metadata.rowgroup_metadata() + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index c9ce24d2a5b..3c4398a87de 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -405,14 +405,14 @@ def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes): assert_eq(expect, got) -def test_parquet_read_metadata(tmpdir, pdf): +def test_parquet_read_metadata(tmp_path, pdf): if len(pdf) > 100: pytest.skip("Skipping long setup test") def num_row_groups(rows, group_size): return max(1, (rows + (group_size - 1)) // group_size) - fname = tmpdir.join("metadata.parquet") + fname = tmp_path / "metadata.parquet" row_group_size = 5 pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index f78d97ef4d1..664faef718f 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. 
# ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx - text.pyx types.pyx +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx + parquet_metadata.pyx text.pyx timezone.pyx types.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 6ba7f78a013..663804e714d 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, orc, parquet, timezone, text, types +from . cimport ( + avro, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 0fc77dd0f57..9e8e0f6e080 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, orc, parquet, timezone, text, types +from . import ( + avro, + csv, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd new file mode 100644 index 00000000000..e421a64adc8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io.parquet_metadata cimport( + parquet_metadata, + parquet_schema, + parquet_column_schema, +) + +cdef class ParquetColumnSchema: + cdef parquet_column_schema column_schema + + @staticmethod + cdef from_column_schema(parquet_column_schema column_schema) + + cpdef str name(self) + + cpdef int num_children(self) + + cpdef ParquetColumnSchema child(self, int idx) + + cpdef list children(self) + + +cdef class ParquetSchema: + cdef parquet_schema schema + + @staticmethod + cdef from_schema(parquet_schema schema) + + cpdef ParquetColumnSchema root(self) + + +cdef class ParquetMetadata: + cdef parquet_metadata meta + + @staticmethod + cdef from_metadata(parquet_metadata meta) + + cpdef ParquetSchema schema(self) + + cpdef int num_rows(self) + + cpdef int num_rowgroups(self) + + cpdef dict metadata(self) + + cpdef list rowgroup_metadata(self) + + +cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info) diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx new file mode 100644 index 00000000000..352905ff0f8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx @@ -0,0 +1,207 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io cimport parquet_metadata as cpp_parquet_metadata + + +cdef class ParquetColumnSchema: + """ + Schema of a parquet column, including the nested columns. + + Parameters + ---------- + parquet_column_schema + """ + def __init__(self): + raise ValueError("Construct ParquetColumnSchema with from_column_schema.") + + @staticmethod + cdef from_column_schema(cpp_parquet_metadata.parquet_column_schema column_schema): + cdef ParquetColumnSchema result = ParquetColumnSchema.__new__( + ParquetColumnSchema + ) + result.column_schema = column_schema + return result + + cpdef str name(self): + """ + Returns parquet column name; can be empty. 
+ + Returns + ------- + str + Column name + """ + return self.column_schema.name().decode() + + cpdef int num_children(self): + """ + Returns the number of child columns. + + Returns + ------- + int + Children count + """ + return self.column_schema.num_children() + + cpdef ParquetColumnSchema child(self, int idx): + """ + Returns schema of the child with the given index. + + Parameters + ---------- + idx : int + Child Index + + Returns + ------- + ParquetColumnSchema + Child schema + """ + return ParquetColumnSchema.from_column_schema(self.column_schema.child(idx)) + + cpdef list children(self): + """ + Returns schemas of all child columns. + + Returns + ------- + list[ParquetColumnSchema] + Child schemas. + """ + cdef cpp_parquet_metadata.parquet_column_schema child + return [ + ParquetColumnSchema.from_column_schema(child) + for child in self.column_schema.children() + ] + + +cdef class ParquetSchema: + """ + Schema of a parquet file. + + Parameters + ---------- + parquet_schema + """ + + def __init__(self): + raise ValueError("Construct ParquetSchema with from_schema.") + + @staticmethod + cdef from_schema(cpp_parquet_metadata.parquet_schema schema): + cdef ParquetSchema result = ParquetSchema.__new__(ParquetSchema) + result.schema = schema + return result + + cpdef ParquetColumnSchema root(self): + """ + Returns the schema of the struct column that contains all columns as fields. + + Returns + ------- + ParquetColumnSchema + Root column schema + """ + return ParquetColumnSchema.from_column_schema(self.schema.root()) + + +cdef class ParquetMetadata: + """ + Information about content of a parquet file. 
+ + Parameters + ---------- + parquet_metadata + """ + + def __init__(self): + raise ValueError("Construct ParquetMetadata with from_metadata.") + + @staticmethod + cdef from_metadata(cpp_parquet_metadata.parquet_metadata meta): + cdef ParquetMetadata result = ParquetMetadata.__new__(ParquetMetadata) + result.meta = meta + return result + + cpdef ParquetSchema schema(self): + """ + Returns the parquet schema. + + Returns + ------- + ParquetSchema + Parquet schema + """ + return ParquetSchema.from_schema(self.meta.schema()) + + cpdef int num_rows(self): + """ + Returns the number of rows of the root column. + + Returns + ------- + int + Number of rows + """ + return self.meta.num_rows() + + cpdef int num_rowgroups(self): + """ + Returns the number of rowgroups in the file. + + Returns + ------- + int + Number of row groups. + """ + return self.meta.num_rowgroups() + + cpdef dict metadata(self): + """ + Returns the key-value metadata in the file footer. + + Returns + ------- + dict[str, str] + Key value metadata as a map. + """ + return {key.decode(): val.decode() for key, val in self.meta.metadata()} + + cpdef list rowgroup_metadata(self): + """ + Returns the row group metadata in the file footer. + + Returns + ------- + list[dict[str, int]] + Vector of row group metadata as maps. + """ + return [ + {key.decode(): val for key, val in metadata} + for metadata in self.meta.rowgroup_metadata() + ] + + +cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info): + """ + Reads metadata of parquet dataset. + + Parameters + ---------- + src_info : SourceInfo + Dataset source. + + Returns + ------- + ParquetMetadata + Parquet_metadata with parquet schema, number of rows, + number of row groups and key-value metadata.
+ """ + cdef cpp_parquet_metadata.parquet_metadata c_result + + with nogil: + c_result = cpp_parquet_metadata.read_parquet_metadata(src_info.c_obj) + + return ParquetMetadata.from_metadata(c_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd index 8e6da56c9a6..b0ce13e4492 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -cimport pylibcudf.libcudf.io.types as cudf_io_types from libc.stdint cimport int64_t from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.io.types cimport source_info cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: @@ -28,4 +28,4 @@ cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: unordered_map[string, string] metadata() except+ vector[unordered_map[string, int64_t]] rowgroup_metadata() except+ - cdef parquet_metadata read_parquet_metadata(cudf_io_types.source_info src) except+ + cdef parquet_metadata read_parquet_metadata(source_info src_info) except+ From db69c52d9140d909aeb4af3a5b3db1e7c44c92bc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:46:27 -0800 Subject: [PATCH 25/40] Follow up making Python tests more deterministic (#17272) Addressing comments in https://github.com/rapidsai/cudf/pull/17008/files#r1823318321 and https://github.com/rapidsai/cudf/pull/17008/files#r1823318898 Didn't touch the `_fuzz_testing` directory because maybe we don't want that to be deterministic? 
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17272 --- .pre-commit-config.yaml | 4 ++-- python/cudf/cudf/tests/test_parquet.py | 11 +++-------- .../dask_cudf/tests/test_reductions.py | 17 +---------------- python/dask_cudf/dask_cudf/tests/utils.py | 2 +- 4 files changed, 7 insertions(+), 27 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5234f58efe..6d070a8a14c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,8 +90,8 @@ repos: entry: | # Check for usage of default_rng without seeding default_rng\(\)| - # Check for usage of np.random.seed - np.random.seed\( + # Check for usage of np.random.seed (NPY002 only disallows this being called) + np.random.seed language: pygrep types: [python] - id: cmake-format diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 3c4398a87de..96512dacb69 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -193,11 +193,6 @@ def parquet_file(request, tmp_path_factory, pdf): return fname -@pytest.fixture(scope="module") -def rdg_seed(): - return int(os.environ.get("TEST_CUDF_RDG_SEED", "42")) - - def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): test_pdf = pd.DataFrame( [list(range(ncolumns * i, ncolumns * (i + 1))) for i in range(nrows)], @@ -431,7 +426,7 @@ def num_row_groups(rows, group_size): assert a == b -def test_parquet_read_filtered(tmpdir, rdg_seed): +def test_parquet_read_filtered(tmpdir): # Generate data fname = tmpdir.join("filtered.parquet") dg.generate( @@ -455,13 +450,13 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( 40, 0.2, - lambda: np.random.default_rng(seed=None).integers( + 
lambda: np.random.default_rng(seed=0).integers( 0, 100, size=40 ), True, ), ], - seed=rdg_seed, + seed=42, ), format={"name": "parquet", "row_group_size": 64}, ) diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 4351b672151..f11a5252080 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -1,7 +1,5 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -import numpy as np -import pandas as pd import pytest import dask @@ -10,20 +8,7 @@ import cudf import dask_cudf - - -def _make_random_frame(nelem, npartitions=2): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "x": rng.integers(0, 5, size=nelem), - "y": rng.normal(loc=1.0, scale=1.0, size=nelem), - } - ) - gdf = cudf.DataFrame.from_pandas(df) - dgf = dask_cudf.from_cudf(gdf, npartitions=npartitions) - return df, dgf - +from dask_cudf.tests.utils import _make_random_frame _reducers = ["sum", "count", "mean", "var", "std", "min", "max"] diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index a9f61f75762..b44b3f939e7 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -19,7 +19,7 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): - rng = np.random.default_rng(seed=None) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( {"x": rng.random(size=nelem), "y": rng.random(size=nelem)} ) From 0fc5fab825ece5b605d84a3d5ef04d7dde31b39f Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Sat, 9 Nov 2024 00:01:26 +0000 Subject: [PATCH 26/40] Use numba-cuda<0.0.18 (#17280) Numba-cuda 0.0.18 (not yet released) contains some changes that might break pynvjitlink patching. 
In order to avoid breaking RAPIDS CI whilst working through that after releasing numba-cuda 0.0.18 but before the next pynvjitlink, this PR makes use of numba-cuda 0.0.17 or less a requirement. Authors: - Graham Markall (https://github.com/gmarkall) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - https://github.com/brandon-b-miller - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17280 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 6fbdd4ba568..01764411346 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,7 +55,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 4aafa12fdae..9074e6332d9 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2aafcae072d..04904e95630 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.0.13 + - numba-cuda >=0.0.13,<0.0.18 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ 
version }} diff --git a/dependencies.yaml b/dependencies.yaml index 59f8f2fda49..e47e0c7523c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -675,7 +675,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.0.13 + - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18 - nvtx>=0.2.1 - packaging - rich diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 41dedc4ff20..ca6dbddfecc 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==24.12.*,>=0.0.0a0", - "numba-cuda>=0.0.13", + "numba-cuda>=0.0.13,<0.0.18", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index c7e4cbc45ea..c4bfc3054bc 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==24.12.*,>=0.0.0a0", - "numba-cuda>=0.0.13", + "numba-cuda>=0.0.13,<0.0.18", "pytest-cov", "pytest-xdist", "pytest<8", From e399e9596d9fe1cf2df0ff1270e2c0c764331b8e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:23:25 -0800 Subject: [PATCH 27/40] Use pylibcudf enums in cudf Python quantile (#17287) Shouldn't need to use the "private" `pylibcudf.libcudf` types anymore now that the Python side enums are exposed Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17287 --- python/cudf/cudf/_lib/quantiles.pyx | 28 +++--------------- python/cudf/cudf/_lib/types.pxd | 5 ---- python/cudf/cudf/_lib/types.pyx | 44 ----------------------------- python/cudf/cudf/core/frame.py | 12 ++++---- 4 files changed, 10 insertions(+), 79 deletions(-) diff 
--git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 7666b7ff8da..509cfe5e9f8 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -6,14 +6,6 @@ from libcpp cimport bool from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_sorted, -) - -from cudf._lib.types import Interpolation - -from pylibcudf.libcudf.types cimport interpolation, sorted from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -28,17 +20,13 @@ def quantile( Column ordered_indices, bool exact, ): - cdef interpolation c_interp = ( - Interpolation[interp.upper()] - ) - return Column.from_pylibcudf( plc.quantiles.quantile( input.to_pylibcudf(mode="read"), q, - c_interp, + plc.types.Interpolation[interp.upper()], ordered_indices.to_pylibcudf(mode="read"), - exact + exact ) ) @@ -51,22 +39,14 @@ def quantile_table( list column_order, list null_precedence, ): - - cdef interpolation c_interp = ( - interp - ) - cdef sorted c_is_input_sorted = ( - is_input_sorted - ) - return columns_from_pylibcudf_table( plc.quantiles.quantiles( plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]), q, - c_interp, - c_is_input_sorted, + interp, + is_input_sorted, column_order, null_precedence ) diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index 4fd3d31841e..c2b760490c1 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -7,12 +7,7 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -ctypedef bool underlying_type_t_order -ctypedef bool underlying_type_t_null_order -ctypedef bool underlying_type_t_sorted -ctypedef int32_t underlying_type_t_interpolation ctypedef int32_t underlying_type_t_type_id -ctypedef bool 
underlying_type_t_null_policy cdef dtype_from_column_view(column_view cv) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 861bb063707..f169ea12b10 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -11,12 +11,6 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_order, - underlying_type_t_sorted, -) - import pylibcudf import cudf @@ -151,44 +145,6 @@ datetime_unit_map = { size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] -class Interpolation(IntEnum): - LINEAR = ( - libcudf_types.interpolation.LINEAR - ) - LOWER = ( - libcudf_types.interpolation.LOWER - ) - HIGHER = ( - libcudf_types.interpolation.HIGHER - ) - MIDPOINT = ( - libcudf_types.interpolation.MIDPOINT - ) - NEAREST = ( - libcudf_types.interpolation.NEAREST - ) - - -class Order(IntEnum): - ASCENDING = libcudf_types.order.ASCENDING - DESCENDING = libcudf_types.order.DESCENDING - - -class Sorted(IntEnum): - YES = libcudf_types.sorted.YES - NO = libcudf_types.sorted.NO - - -class NullOrder(IntEnum): - BEFORE = libcudf_types.null_order.BEFORE - AFTER = libcudf_types.null_order.AFTER - - -class NullHandling(IntEnum): - INCLUDE = libcudf_types.null_policy.INCLUDE - EXCLUDE = libcudf_types.null_policy.EXCLUDE - - cdef dtype_from_lists_column_view(column_view cv): # lists_column_view have no default constructor, so we heap # allocate it to get around Cython's limitation of requiring diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 205edd91d9d..2b4a17f9559 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -16,6 +16,8 @@ import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf from cudf import 
_lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar @@ -789,15 +791,13 @@ def _quantile_table( column_order=(), null_precedence=(), ): - interpolation = libcudf.types.Interpolation[interpolation] + interpolation = plc.types.Interpolation[interpolation] - is_sorted = libcudf.types.Sorted["YES" if is_sorted else "NO"] + is_sorted = plc.types.Sorted["YES" if is_sorted else "NO"] - column_order = [libcudf.types.Order[key] for key in column_order] + column_order = [plc.types.Order[key] for key in column_order] - null_precedence = [ - libcudf.types.NullOrder[key] for key in null_precedence - ] + null_precedence = [plc.types.NullOrder[key] for key in null_precedence] return self._from_columns_like_self( libcudf.quantiles.quantile_table( From 7a499f645c040c300e466721a39be65e3e1b054e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Nov 2024 17:38:47 -0800 Subject: [PATCH 28/40] Use more pylibcudf Python enums in cudf._lib (#17288) Similar to https://github.com/rapidsai/cudf/pull/17287. 
Also remove a `plc` naming shadowing Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17288 --- python/cudf/cudf/_lib/groupby.pyx | 7 ++----- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/_lib/lists.pyx | 8 ++++++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index c199ed96d4f..1ce6dfab15e 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -18,7 +18,6 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib.scalar import as_device_scalar -from pylibcudf.libcudf.replace cimport replace_policy from pylibcudf.libcudf.scalar.scalar cimport scalar import pylibcudf @@ -244,13 +243,11 @@ cdef class GroupBy: return columns_from_pylibcudf_table(shifts), columns_from_pylibcudf_table(keys) def replace_nulls(self, list values, object method): - # TODO: This is using an enum (replace_policy) that has not been exposed in - # pylibcudf yet. We'll want to fix that import once it is in pylibcudf. 
_, replaced = self._groupby.replace_nulls( pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]), [ - replace_policy.PRECEDING - if method == 'ffill' else replace_policy.FOLLOWING + pylibcudf.replace.ReplacePolicy.PRECEDING + if method == 'ffill' else pylibcudf.replace.ReplacePolicy.FOLLOWING ] * len(values), ) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index fb149603960..7dc9cd01a00 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -104,7 +104,7 @@ cpdef read_json(object filepaths_or_buffers, ) df = cudf.DataFrame._from_data( *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in res_cols], + columns=[Column.from_pylibcudf(col) for col in res_cols], column_names=res_col_names, index_names=None ) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 12432ac6d5d..a91d44274e5 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -49,7 +49,11 @@ def sort_lists(Column col, bool ascending, str na_position): plc.lists.sort_lists( col.to_pylibcudf(mode="read"), ascending, - null_order.BEFORE if na_position == "first" else null_order.AFTER, + ( + plc.types.NullOrder.BEFORE + if na_position == "first" + else plc.types.NullOrder.AFTER + ), False, ) ) From 5cbdcd07a71fd63813840fdf270d7aec62f1c844 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 8 Nov 2024 21:53:45 -0500 Subject: [PATCH 29/40] Expose delimiter character in JSON reader options to JSON reader APIs (#17266) Fixes #17261 Removes delimiter symbol group from whitespace normalization FST since it is run post-tokenization. 
Authors: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17266 --- cpp/include/cudf/io/detail/json.hpp | 8 +-- cpp/src/io/json/json_normalization.cu | 49 ++++++++++--------- cpp/src/io/json/read_json.cu | 3 +- .../io/json/json_quote_normalization_test.cpp | 21 ++++++-- 4 files changed, 49 insertions(+), 32 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 940d03cdb41..2e2ac43d6fe 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -57,11 +57,13 @@ void write_json(data_sink* sink, /** * @brief Normalize single quotes to double quotes using FST * - * @param indata Input device buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @param indata Input device buffer + * @param delimiter Line-separating delimiter character in JSONL inputs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 34a87918e57..1b61be20202 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " SINGLE_QUOTE_CHAR, ///< Quote character SG: ' ESCAPE_CHAR, ///< Escape character SG: '\' - NEWLINE_CHAR, ///< Newline character 
SG: '\n' + DELIM_CHAR, ///< Delimiter character SG OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; @@ -72,13 +72,17 @@ constexpr auto TT_SEC = dfa_states::TT_SEC; constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const qna_sgs{ - {{'\"'}, {'\''}, {'\\'}, {'\n'}}}; +auto get_sgid_lut(SymbolT delim) +{ + // The i-th string representing all the characters of a symbol group + std::array, NUM_SYMBOL_GROUPS - 1> symbol_groups{ + {{'\"'}, {'\''}, {'\\'}, {delim}}}; + return symbol_groups; +} // Transition table std::array, TT_NUM_STATES> const qna_state_tt{{ - /* IN_STATE " ' \ \n OTHER */ + /* IN_STATE " ' \ OTHER */ /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}}, /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}}, /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}}, @@ -199,28 +203,26 @@ struct TransduceToNormalizedQuotes { namespace normalize_whitespace { +// We do not need a symbol group for the delimiter character since whitespace normalization +// now occurs after tokenization. 
enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " ESCAPE_CHAR, ///< Escape character SG: '\\' - NEWLINE_CHAR, ///< Newline character SG: '\n' WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; // Alias for readability of symbol group ids constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ - {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; + +std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}}; /** * -------- FST states --------- * ----------------------------- * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double - * | quotes as well as any other character not enclosed by a string. Also handles - * | newline character present within a string - * TT_DQS | Double-quoted string state handling all characters within double quotes except - * | newline character + * | quotes as well as any other character not enclosed by a string. + * TT_DQS | Double-quoted string state handling all characters within double quotes * TT_DEC | State handling escaped characters inside double-quoted string. Note that this * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed. 
@@ -235,10 +237,10 @@ constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); // Transition table std::array, TT_NUM_STATES> const wna_state_tt{ - {/* IN_STATE " \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; + {/* IN_STATE " \ OTHER */ + /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; // The DFA's starting state constexpr StateT start_state = static_cast(TT_OOS); @@ -302,18 +304,19 @@ struct TransduceToNormalizedWS { namespace detail { void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; - auto parser = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs), - fst::detail::make_transition_table(normalize_quotes::qna_state_tt), - fst::detail::make_translation_functor( - normalize_quotes::TransduceToNormalizedQuotes{}), - stream); + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)), + fst::detail::make_transition_table(normalize_quotes::qna_state_tt), + fst::detail::make_translation_functor( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_buffer outbuf(indata.size() * 2, stream, mr); cudf::detail::device_scalar outbuf_size(stream, mr); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 2bc15ea19cb..279f5e71351 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -248,7 +248,8 @@ table_with_metadata read_batch(host_span> sources, // If input JSON buffer has single quotes and option to normalize single quotes is enabled, // invoke 
pre-processing FST if (reader_opts.is_enabled_normalize_single_quotes()) { - normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); + normalize_single_quotes( + bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref()); } auto buffer = diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index c8c2d18903f..0fbd7da7f4d 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -34,7 +34,9 @@ // Base test fixture for tests struct JsonNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) +void run_test(std::string const& host_input, + std::string const& expected_host_output, + char delimiter = '\n') { // RMM memory resource std::shared_ptr rsc = @@ -46,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou // Preprocessing FST cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get()); + cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get()); std::string preprocessed_host_output(device_data.size(), 0); CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), @@ -172,6 +174,13 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces run_test(input, output); } +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter) +{ + std::string input{"{\"a\": \"1\n2\"}z{\'a\': 12}"}; + std::string output{"{\"a\": \"1\n2\"}z{\"a\": 12}"}; + run_test(input, output, 'z'); +} + TEST_F(JsonNormalizationTest, ReadJsonOption) { // RMM memory resource @@ -179,22 +188,24 @@ TEST_F(JsonNormalizationTest, ReadJsonOption) std::make_shared(); // Test input - std::string const host_input 
= R"({"A":'TEST"'})"; + std::string const host_input = R"({"a": "1\n2"}h{'a': 12})"; cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) .lines(true) + .delimiter('h') .normalize_single_quotes(true); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); // Expected table - std::string const expected_input = R"({"A":"TEST\""})"; + std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})"; cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) - .lines(true); + .lines(true) + .delimiter('h'); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); From 84743c3d413f386077ff6f5f162e5d5159449ccd Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 11 Nov 2024 18:19:28 -0600 Subject: [PATCH 30/40] Fix `Dataframe.__setitem__` slow-downs (#17222) Fixes: #17140 This PR fixes slow-downs in `DataFrame.__setitem__` by properly passing in CPU objects where needed instead of passing a GPU object and then failing and performing a GPU -> CPU transfer. `DataFrame.__setitem__` first argument can be a column(pd.Index), in our fast path this will be converted to `cudf.Index` and thus there will be failure from cudf side and then the transfer to CPU + slow-path executes, this is the primary reason for slowdown. This PR maintains a dict mapping of such special functions where we shouldn't be converting the objects to fast path.
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17222 --- python/cudf/cudf/pandas/fast_slow_proxy.py | 49 ++++++++++++++++++- .../cudf_pandas_tests/test_cudf_pandas.py | 23 +++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 99c0cb82f41..9768a6c4a2f 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -33,6 +33,20 @@ def call_operator(fn, args, kwargs): "EXECUTE_SLOW": 0x0571B0, } +# This is a dict of functions that are known to have arguments that +# need to be transformed from fast to slow only. i.e., Some cudf functions +# error on passing a device object but don't error on passing a host object. +# For example: DataFrame.__setitem__(arg, value) errors on passing a +# cudf.Index object but doesn't error on passing a pd.Index object. +# Hence we need to transform the arg from fast to slow only. So, we use +# a dictionary like: +# {"DataFrame.__setitem__": {0}} +# where the keys are the function names and the values are the indices +# (0-based) of the arguments that need to be transformed. + +_SPECIAL_FUNCTIONS_ARGS_MAP = { + "DataFrame.__setitem__": {0}, +} _WRAPPER_ASSIGNMENTS = tuple( attr @@ -875,6 +889,10 @@ def __name__(self, value): pass setattr(self._fsproxy_slow, "__name__", value) + @property + def _customqualname(self): + return self._fsproxy_slow.__qualname__ + def _assert_fast_slow_eq(left, right): if _is_final_type(type(left)) or type(left) in NUMPY_TYPES: @@ -1011,7 +1029,36 @@ def _transform_arg( # use __reduce_ex__ instead... 
if type(arg) is tuple: # Must come first to avoid infinite recursion - return tuple(_transform_arg(a, attribute_name, seen) for a in arg) + if ( + len(arg) > 0 + and isinstance(arg[0], _MethodProxy) + and arg[0]._customqualname in _SPECIAL_FUNCTIONS_ARGS_MAP + ): + indices_map = _SPECIAL_FUNCTIONS_ARGS_MAP[ + arg[0]._customqualname + ] + method_proxy, original_args, original_kwargs = arg + + original_args = tuple( + _transform_arg(a, "_fsproxy_slow", seen) + if i - 1 in indices_map + else _transform_arg(a, attribute_name, seen) + for i, a in enumerate(original_args) + ) + original_kwargs = _transform_arg( + original_kwargs, attribute_name, seen + ) + return tuple( + ( + _transform_arg(method_proxy, attribute_name, seen), + original_args, + original_kwargs, + ) + ) + else: + return tuple( + _transform_arg(a, attribute_name, seen) for a in arg + ) elif hasattr(arg, "__getnewargs_ex__"): # Partial implementation of to reconstruct with # transformed pieces diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index e260b448219..d48fbad0ec3 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -12,6 +12,7 @@ import pickle import subprocess import tempfile +import time import types from io import BytesIO, StringIO @@ -1795,3 +1796,25 @@ def test_iter_doesnot_raise(monkeypatch): monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") for _ in s: pass + + +def test_dataframe_setitem_slowdown(): + # We are explicitly testing the slowdown of the setitem operation + df = xpd.DataFrame( + {"a": [1, 2, 3] * 100000, "b": [1, 2, 3] * 100000} + ).astype("float64") + df = xpd.DataFrame({"a": df["a"].repeat(1000), "b": df["b"].repeat(1000)}) + new_df = df + 1 + start_time = time.time() + df[df.columns] = new_df + end_time = time.time() + delta = int(end_time - start_time) + if delta > 5: + pytest.fail(f"Test took too long to run, runtime: {delta}") + 
+ +def test_dataframe_setitem(): + df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).astype("float64") + new_df = df + 1 + df[df.columns] = new_df + tm.assert_equal(df, new_df) From 61031ccd5977d5d85bf0b8e9c32bea1c853a25ae Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 11 Nov 2024 21:57:47 -0500 Subject: [PATCH 31/40] Expose streams in public quantile APIs (#17257) Adds stream parameter to ``` cudf::quantile cudf::quantiles cudf::percentile_approx ``` Added stream gtests to verify correct stream forwarding. Reference: #13744 Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17257 --- cpp/include/cudf/quantiles.hpp | 6 +++ cpp/src/quantiles/quantile.cu | 3 +- cpp/src/quantiles/quantiles.cu | 11 ++--- cpp/src/quantiles/tdigest/tdigest.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/quantile_test.cpp | 74 ++++++++++++++++++++++++++++ 6 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 cpp/tests/streams/quantile_test.cpp diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index f6bae170f03..f0039734519 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -48,6 +48,7 @@ namespace CUDF_EXPORT cudf { * ignored. * @param[in] exact If true, returns doubles. * If false, returns same type as input. 
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned column's device memory * @returns Column of specified quantiles, with nulls for indeterminable values @@ -59,6 +60,7 @@ std::unique_ptr quantile( interpolation interp = interpolation::LINEAR, column_view const& ordered_indices = {}, bool exact = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -85,6 +87,7 @@ std::unique_ptr quantile( * @param is_input_sorted Indicates if the input has been pre-sorted * @param column_order The desired sort order for each column * @param null_precedence The desired order of null compared to other elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @returns Table of specified quantiles, with nulls for indeterminable values @@ -98,6 +101,7 @@ std::unique_ptr
quantiles( cudf::sorted is_input_sorted = sorted::NO, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -114,6 +118,7 @@ std::unique_ptr
quantiles( * * @param input tdigest input data. One tdigest per row * @param percentiles Desired percentiles in range [0, 1] + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device * memory * @@ -125,6 +130,7 @@ std::unique_ptr
quantiles( std::unique_ptr percentile_approx( tdigest::tdigest_column_view const& input, column_view const& percentiles, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 80fd72a3088..21f6fe87a62 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -195,10 +195,11 @@ std::unique_ptr quantile(column_view const& input, interpolation interp, column_view const& ordered_indices, bool exact, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantile(input, q, interp, ordered_indices, exact, cudf::get_default_stream(), mr); + return detail::quantile(input, q, interp, ordered_indices, exact, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index 69421f3bfc4..a94fb9362b9 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -103,17 +103,12 @@ std::unique_ptr
quantiles(table_view const& input, cudf::sorted is_input_sorted, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantiles(input, - q, - interp, - is_input_sorted, - column_order, - null_precedence, - cudf::get_default_stream(), - mr); + return detail::quantiles( + input, q, interp, is_input_sorted, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 43c3b0a291b..fb5aebb4b39 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -410,10 +410,11 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr); + return tdigest::percentile_approx(input, percentiles, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f502195aea4..3a9b930830b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -711,6 +711,7 @@ ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARTITIONING_TEST streams/partitioning_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) +ConfigureTest(STREAM_QUANTILE_TEST streams/quantile_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE 
testing) diff --git a/cpp/tests/streams/quantile_test.cpp b/cpp/tests/streams/quantile_test.cpp new file mode 100644 index 00000000000..4f4f16a9e70 --- /dev/null +++ b/cpp/tests/streams/quantile_test.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +struct QuantileTest : public cudf::test::BaseFixture {}; + +TEST_F(QuantileTest, TestMultiColumnUnsorted) +{ + auto input_a = cudf::test::strings_column_wrapper( + {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C", + "D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"}, + {true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + cudf::test::fixed_width_column_wrapper input_b( + {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto input = cudf::table_view({input_a, input_b}); + + auto actual = cudf::quantiles(input, + {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, + cudf::interpolation::NEAREST, + cudf::sorted::NO, + {cudf::order::ASCENDING, cudf::order::DESCENDING}, + {}, + cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, TestEmpty) +{ + auto input = cudf::test::fixed_width_column_wrapper({}); + cudf::quantile( + input, {0.5, 0.25}, 
cudf::interpolation::LINEAR, {}, true, cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, EmptyInput) +{ + auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); + cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; + + std::vector input; + input.push_back(*empty_); + input.push_back(*empty_); + input.push_back(*empty_); + auto empty = cudf::concatenate(input, cudf::test::get_default_stream()); + + cudf::tdigest::tdigest_column_view tdv(*empty); + auto result = cudf::percentile_approx(tdv, percentiles, cudf::test::get_default_stream()); +} From bdddab39826c061d3fad932aa306ba9313b1d062 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 12 Nov 2024 04:52:11 +0100 Subject: [PATCH 32/40] cmake option: `CUDF_KVIKIO_REMOTE_IO` (#17291) Compile flag to enable/disable remote IO through KvikIO: `CUDF_KVIKIO_REMOTE_IO` Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17291 --- cpp/CMakeLists.txt | 12 ++++++++++++ cpp/cmake/thirdparty/get_kvikio.cmake | 2 +- cpp/src/io/utilities/datasource.cpp | 19 ++++++++++++++++--- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 559826ac232..65b05fd518b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -90,6 +90,12 @@ option( mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF) +option( + CUDF_KVIKIO_REMOTE_IO + "Enable remote IO (e.g. AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO through fsspec." 
+ ON +) + message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}") @@ -109,6 +115,9 @@ message( "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}" ) message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +message(VERBOSE + "CUDF: Build with remote IO (e.g. AWS S3) support through KvikIO: ${CUDF_KVIKIO_REMOTE_IO}" +) # Set a default build type if none was specified rapids_cmake_build_type("Release") @@ -890,6 +899,9 @@ target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL # Define spdlog level target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Enable remote IO through KvikIO +target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index c949f48505e..73f875b46c2 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -22,7 +22,7 @@ function(find_and_configure_kvikio VERSION) GIT_REPOSITORY https://github.com/rapidsai/kvikio.git GIT_TAG branch-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp - OPTIONS "KvikIO_BUILD_EXAMPLES OFF" + OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) include("${rapids-cmake-dir}/export/find_package_root.cmake") diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 9ea39e692b6..5ccc91e4220 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -26,7 +26,6 @@ #include #include -#include #include @@ -37,6 +36,10 @@ #include #include +#ifdef CUDF_KVIKIO_REMOTE_IO 
+#include <kvikio/remote_handle.hpp> +#endif + namespace cudf { namespace io { namespace { @@ -391,6 +394,7 @@ class user_datasource_wrapper : public datasource { datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; +#ifdef CUDF_KVIKIO_REMOTE_IO /** * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. */ @@ -463,14 +467,23 @@ class remote_file_source : public datasource { static bool is_supported_remote_url(std::string const& url) { // Regular expression to match "s3://" - std::regex pattern{R"(^s3://)", std::regex_constants::icase}; + static std::regex pattern{R"(^s3://)", std::regex_constants::icase}; return std::regex_search(url, pattern); } private: kvikio::RemoteHandle _kvikio_file; }; - +#else +/** + * @brief When KvikIO remote IO is disabled, `is_supported_remote_url()` always returns false. + */ +class remote_file_source : public file_source { + public: + explicit remote_file_source(char const* filepath) : file_source(filepath) {} + static constexpr bool is_supported_remote_url(std::string const&) { return false; } +}; +#endif } // namespace std::unique_ptr<datasource> datasource::create(std::string const& filepath, From 202c2318282e859c8a156a48cfbc133dd2941117 Mon Sep 17 00:00:00 2001 From: Peixin Date: Tue, 12 Nov 2024 12:36:44 +0800 Subject: [PATCH 33/40] Replace workaround of JNI build with CUDF_KVIKIO_REMOTE_IO=OFF (#17293) JNI build does not require kvikIO, to unblock the build use `CUDF_KVIKIO_REMOTE_IO=OFF` in cpp build phase. this should be merged after https://github.com/rapidsai/cudf/pull/17291 Authors: - Peixin (https://github.com/pxLi) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17293 --- java/ci/build-in-docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 4b5379cf0f1..b85c215d7d1 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -65,7 +65,7 @@ cmake .. 
-G"${CMAKE_GENERATOR}" \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ -DBUILD_SHARED_LIBS=OFF \ - -DKvikIO_REMOTE_SUPPORT=OFF + -DCUDF_KVIKIO_REMOTE_IO=OFF if [[ -z "${PARALLEL_LEVEL}" ]]; then cmake --build . From 043bcbdf28aa9f7213c3f1f2b4170f4940c9d39e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 12 Nov 2024 07:12:05 -0500 Subject: [PATCH 34/40] [FEA] Report all unsupported operations for a query in cudf.polars (#16960) Closes #16690. The purpose of this PR is to list all of the unique operations that are unsupported by `cudf.polars` when running a query. 1. Question: How to traverse the tree to report the error nodes? Should this be done upstream in Polars? 2. Instead of traversing the query afterwards, we should probably catch each unsupported feature as we translate the IR. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16960 --- python/cudf_polars/cudf_polars/__init__.py | 4 +- python/cudf_polars/cudf_polars/callback.py | 32 +- python/cudf_polars/cudf_polars/dsl/expr.py | 2 + .../cudf_polars/dsl/expressions/base.py | 11 + python/cudf_polars/cudf_polars/dsl/ir.py | 19 +- .../cudf_polars/cudf_polars/dsl/translate.py | 382 ++++++++++-------- .../cudf_polars/testing/asserts.py | 14 +- .../cudf_polars/cudf_polars/utils/dtypes.py | 11 +- python/cudf_polars/docs/overview.md | 4 +- python/cudf_polars/tests/dsl/test_to_ast.py | 4 +- .../cudf_polars/tests/dsl/test_traversal.py | 8 +- .../tests/expressions/test_sort.py | 4 +- python/cudf_polars/tests/test_mapfunction.py | 13 - 13 files changed, 297 insertions(+), 211 deletions(-) diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 66c15f694ee..ba4858c5619 100644 --- 
a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -12,7 +12,7 @@ from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator # Check we have a supported polars version from cudf_polars.utils.versions import _ensure_polars_version @@ -22,7 +22,7 @@ __all__: list[str] = [ "execute_with_cudf", - "translate_ir", + "Translator", "__git_commit__", "__version__", ] diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 76816ee0a61..ff4933c7564 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -18,7 +18,7 @@ import rmm from rmm._cuda import gpu -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator if TYPE_CHECKING: from collections.abc import Generator @@ -180,14 +180,30 @@ def execute_with_cudf( ) try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - nt.set_udf( - partial( - _callback, - translate_ir(nt), - device=device, - memory_resource=memory_resource, + translator = Translator(nt) + ir = translator.translate_ir() + ir_translation_errors = translator.errors + if len(ir_translation_errors): + # TODO: Display these errors in user-friendly way. 
+ # tracked in https://github.com/rapidsai/cudf/issues/17051 + unique_errors = sorted(set(ir_translation_errors), key=str) + error_message = "Query contained unsupported operations" + verbose_error_message = ( + f"{error_message}\nThe errors were:\n{unique_errors}" + ) + unsupported_ops_exception = NotImplementedError( + error_message, unique_errors + ) + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn(verbose_error_message, UserWarning, stacklevel=2) + if raise_on_fail: + raise unsupported_ops_exception + else: + nt.set_udf( + partial( + _callback, ir, device=device, memory_resource=memory_resource + ) ) - ) except exception as e: if bool(int(os.environ.get("POLARS_VERBOSE", 0))): warnings.warn( diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 1881286ccbb..326d6b65cbe 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -20,6 +20,7 @@ AggInfo, Col, ColRef, + ErrorExpr, Expr, NamedExpr, ) @@ -36,6 +37,7 @@ __all__ = [ "Expr", + "ErrorExpr", "NamedExpr", "Literal", "LiteralColumn", diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index 21ba7aea707..23851f91938 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -155,6 +155,17 @@ def collect_agg(self, *, depth: int) -> AggInfo: ) # pragma: no cover; check_agg trips first +class ErrorExpr(Expr): + __slots__ = ("error",) + _non_child = ("dtype", "error") + error: str + + def __init__(self, dtype: plc.DataType, error: str) -> None: + self.dtype = dtype + self.error = error + self.children = () + + class NamedExpr: # NamedExpr does not inherit from Expr since it does not appear # when evaluating expressions themselves, only when constructing diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py 
index bc42b4a254f..beea5908e56 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -42,6 +42,7 @@ __all__ = [ "IR", + "ErrorNode", "PythonScan", "Scan", "Cache", @@ -212,6 +213,22 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) +class ErrorNode(IR): + """Represents an error translating the IR.""" + + __slots__ = ("error",) + _non_child = ( + "schema", + "error", + ) + error: str + """The error.""" + + def __init__(self, schema: Schema, error: str): + self.schema = schema + self.error = error + + class PythonScan(IR): """Representation of input from a python function.""" @@ -1532,7 +1549,7 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR): raise NotImplementedError( "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" - ) + ) # pragma: no cover self.options = ( tuple(indices), tuple(pivotees), diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 2711676d31e..e8ed009cdf2 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -9,7 +9,7 @@ import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch -from typing import Any +from typing import TYPE_CHECKING, Any import pyarrow as pa from typing_extensions import assert_never @@ -25,7 +25,123 @@ from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes, sorting -__all__ = ["translate_ir", "translate_named_expr"] +if TYPE_CHECKING: + from cudf_polars.typing import NodeTraverser + +__all__ = ["Translator", "translate_named_expr"] + + +class Translator: + """ + Translates polars-internal IR nodes and expressions to our representation. 
+ + Parameters + ---------- + visitor + Polars NodeTraverser object + """ + + def __init__(self, visitor: NodeTraverser): + self.visitor = visitor + self.errors: list[Exception] = [] + + def translate_ir(self, *, n: int | None = None) -> ir.IR: + """ + Translate a polars-internal IR node to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Optional node to start traversing from, if not provided uses + current polars-internal node. + + Returns + ------- + Translated IR object + + Raises + ------ + NotImplementedError + If the version of Polars IR is unsupported. + + Notes + ----- + Any IR nodes that cannot be translated are replaced by + :class:`ir.ErrorNode` nodes and collected in the `errors` attribute. + After translation is complete, this list of errors should be inspected + to determine if the query is supported. + """ + ctx: AbstractContextManager[None] = ( + set_node(self.visitor, n) if n is not None else noop_context + ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + if (version := self.visitor.version()) >= (4, 0): + e = NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now. 
+ self.errors.append(e) # pragma: no cover + raise e # pragma: no cover + + with ctx: + polars_schema = self.visitor.get_schema() + try: + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + except Exception as e: + self.errors.append(NotImplementedError(str(e))) + return ir.ErrorNode({}, str(e)) + try: + node = self.visitor.view_current_node() + except Exception as e: + self.errors.append(e) + return ir.ErrorNode(schema, str(e)) + try: + result = _translate_ir(node, self, schema) + except Exception as e: + self.errors.append(e) + return ir.ErrorNode(schema, str(e)) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + error = NotImplementedError( + f"No GPU support for {result} with Null column dtype." + ) + self.errors.append(error) + return ir.ErrorNode(schema, str(error)) + + return result + + def translate_expr(self, *, n: int) -> expr.Expr: + """ + Translate a polars-internal expression IR into our representation. + + Parameters + ---------- + n + Node to translate, an integer referencing a polars internal node. + + Returns + ------- + Translated IR object. + + Notes + ----- + Any expression nodes that cannot be translated are replaced by + :class:`expr.ErrorExpr` nodes and collected in the `errors` attribute. + After translation is complete, this list of errors should be inspected + to determine if the query is supported. 
+ """ + node = self.visitor.view_expression(n) + dtype = dtypes.from_polars(self.visitor.get_dtype(n)) + try: + return _translate_expr(node, self, dtype) + except Exception as e: + self.errors.append(e) + return expr.ErrorExpr(dtype, str(e)) class set_node(AbstractContextManager[None]): @@ -67,7 +183,7 @@ def __exit__(self, *args: Any) -> None: @singledispatch def _translate_ir( - node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: Any, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: raise NotImplementedError( f"Translation for {type(node).__name__}" @@ -76,19 +192,19 @@ def _translate_ir( @_translate_ir.register def _( - node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.PythonScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: scan_fn, with_columns, source_type, predicate, nrows = node.options options = (scan_fn, with_columns, source_type, nrows) predicate = ( - translate_named_expr(visitor, n=predicate) if predicate is not None else None + translate_named_expr(translator, n=predicate) if predicate is not None else None ) return ir.PythonScan(schema, options, predicate) @_translate_ir.register def _( - node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Scan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type if typ == "ndjson": @@ -117,7 +233,7 @@ def _( skip_rows, n_rows, row_index, - translate_named_expr(visitor, n=node.predicate) + translate_named_expr(translator, n=node.predicate) if node.predicate is not None else None, ) @@ -125,20 +241,20 @@ def _( @_translate_ir.register def _( - node: pl_ir.Cache, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Cache, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + return ir.Cache(schema, node.id_, 
translator.translate_ir(n=node.input)) @_translate_ir.register def _( - node: pl_ir.DataFrameScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.DataFrameScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.DataFrameScan( schema, node.df, node.projection, - translate_named_expr(visitor, n=node.selection) + translate_named_expr(translator, n=node.selection) if node.selection is not None else None, ) @@ -146,22 +262,22 @@ def _( @_translate_ir.register def _( - node: pl_ir.Select, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Select, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.expr] return ir.Select(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.GroupBy, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.GroupBy, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - aggs = [translate_named_expr(visitor, n=e) for e in node.aggs] - keys = [translate_named_expr(visitor, n=e) for e in node.keys] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + aggs = [translate_named_expr(translator, n=e) for e in node.aggs] + keys = [translate_named_expr(translator, n=e) for e in node.keys] return ir.GroupBy( schema, keys, @@ -174,17 +290,17 @@ def _( @_translate_ir.register def _( - node: pl_ir.Join, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Join, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # Join key dtypes are dependent on the schema of the left and # 
right inputs, so these must be translated with the relevant # input active. - with set_node(visitor, node.input_left): - inp_left = translate_ir(visitor, n=None) - left_on = [translate_named_expr(visitor, n=e) for e in node.left_on] - with set_node(visitor, node.input_right): - inp_right = translate_ir(visitor, n=None) - right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] + with set_node(translator.visitor, node.input_left): + inp_left = translator.translate_ir(n=None) + left_on = [translate_named_expr(translator, n=e) for e in node.left_on] + with set_node(translator.visitor, node.input_right): + inp_right = translator.translate_ir(n=None) + right_on = [translate_named_expr(translator, n=e) for e in node.right_on] if (how := node.options[0]) in { "inner", "left", @@ -239,27 +355,27 @@ def _( @_translate_ir.register def _( - node: pl_ir.HStack, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.HStack, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.exprs] return ir.HStack(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Reduce, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # pragma: no cover; polars doesn't emit this node yet - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.expr] return ir.Reduce(schema, exprs, inp) @_translate_ir.register 
def _( - node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Distinct, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: (keep, subset, maintain_order, zlice) = node.options keep = ir.Distinct._KEEP_MAP[keep] @@ -270,17 +386,17 @@ def _( subset, zlice, maintain_order, - translate_ir(visitor, n=node.input), + translator.translate_ir(n=node.input), ) @_translate_ir.register def _( - node: pl_ir.Sort, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Sort, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - by = [translate_named_expr(visitor, n=e) for e in node.by_column] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + by = [translate_named_expr(translator, n=e) for e in node.by_column] stable, nulls_last, descending = node.sort_options order, null_order = sorting.sort_order( descending, nulls_last=nulls_last, num_keys=len(by) @@ -290,33 +406,35 @@ def _( @_translate_ir.register def _( - node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Slice, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input)) + return ir.Slice( + schema, node.offset, node.len, translator.translate_ir(n=node.input) + ) @_translate_ir.register def _( - node: pl_ir.Filter, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Filter, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - mask = translate_named_expr(visitor, n=node.predicate) + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + mask = translate_named_expr(translator, n=node.predicate) return ir.Filter(schema, mask, inp) @_translate_ir.register 
def _( node: pl_ir.SimpleProjection, - visitor: NodeTraverser, + translator: Translator, schema: dict[str, plc.DataType], ) -> ir.IR: - return ir.Projection(schema, translate_ir(visitor, n=node.input)) + return ir.Projection(schema, translator.translate_ir(n=node.input)) @_translate_ir.register def _( - node: pl_ir.MapFunction, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.MapFunction, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: name, *options = node.function return ir.MapFunction( @@ -324,83 +442,36 @@ def _( name, options, # TODO: merge_sorted breaks this pattern - translate_ir(visitor, n=node.input), + translator.translate_ir(n=node.input), ) @_translate_ir.register def _( - node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Union, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.Union( - schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs) + schema, node.options, *(translator.translate_ir(n=n) for n in node.inputs) ) @_translate_ir.register def _( - node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.HConcat, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs)) - - -def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: - """ - Translate a polars-internal IR node to our representation. - - Parameters - ---------- - visitor - Polars NodeTraverser object - n - Optional node to start traversing from, if not provided uses - current polars-internal node. - - Returns - ------- - Translated IR object - - Raises - ------ - NotImplementedError - If we can't translate the nodes due to unsupported functionality. 
- """ - ctx: AbstractContextManager[None] = ( - set_node(visitor, n) if n is not None else noop_context - ) - # IR is versioned with major.minor, minor is bumped for backwards - # compatible changes (e.g. adding new nodes), major is bumped for - # incompatible changes (e.g. renaming nodes). - if (version := visitor.version()) >= (4, 0): - raise NotImplementedError( - f"No support for polars IR {version=}" - ) # pragma: no cover; no such version for now. - - with ctx: - polars_schema = visitor.get_schema() - node = visitor.view_current_node() - schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} - result = _translate_ir(node, visitor, schema) - if any( - isinstance(dtype, pl.Null) - for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) - ): - raise NotImplementedError( - f"No GPU support for {result} with Null column dtype." - ) - return result + return ir.HConcat(schema, *(translator.translate_ir(n=n) for n in node.inputs)) def translate_named_expr( - visitor: NodeTraverser, *, n: pl_expr.PyExprIR + translator: Translator, *, n: pl_expr.PyExprIR ) -> expr.NamedExpr: """ Translate a polars-internal named expression IR object into our representation. Parameters ---------- - visitor - Polars NodeTraverser object + translator + Translator object n Node to translate, a named expression node. @@ -420,12 +491,12 @@ def translate_named_expr( NotImplementedError If any translation fails due to unsupported functionality. 
""" - return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node)) + return expr.NamedExpr(n.output_name, translator.translate_expr(n=n.node)) @singledispatch def _translate_expr( - node: Any, visitor: NodeTraverser, dtype: plc.DataType + node: Any, translator: Translator, dtype: plc.DataType ) -> expr.Expr: raise NotImplementedError( f"Translation for {type(node).__name__}" @@ -433,7 +504,7 @@ def _translate_expr( @_translate_expr.register -def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> expr.Expr: name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): @@ -442,7 +513,7 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex pl_expr.StringFunction.StripCharsStart, pl_expr.StringFunction.StripCharsEnd, }: - column, chars = (translate_expr(visitor, n=n) for n in node.input) + column, chars = (translator.translate_expr(n=n) for n in node.input) if isinstance(chars, expr.Literal): if chars.value == pa.scalar(""): # No-op in polars, but libcudf uses empty string @@ -459,11 +530,11 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex dtype, name, options, - *(translate_expr(visitor, n=n) for n in node.input), + *(translator.translate_expr(n=n) for n in node.input), ) elif isinstance(name, pl_expr.BooleanFunction): if name == pl_expr.BooleanFunction.IsBetween: - column, lo, hi = (translate_expr(visitor, n=n) for n in node.input) + column, lo, hi = (translator.translate_expr(n=n) for n in node.input) (closed,) = options lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed] return expr.BinOp( @@ -476,7 +547,7 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex dtype, name, options, - *(translate_expr(visitor, n=n) for n in node.input), + *(translator.translate_expr(n=n) for n in node.input), ) elif 
isinstance(name, pl_expr.TemporalFunction): # functions for which evaluation of the expression may not return @@ -496,14 +567,14 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex dtype, name, options, - *(translate_expr(visitor, n=n) for n in node.input), + *(translator.translate_expr(n=n) for n in node.input), ) if name in needs_cast: return expr.Cast(dtype, result_expr) return result_expr elif isinstance(name, str): - children = (translate_expr(visitor, n=n) for n in node.input) + children = (translator.translate_expr(n=n) for n in node.input) if name == "log": (base,) = options (child,) = children @@ -522,26 +593,26 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex @_translate_expr.register -def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Window, translator: Translator, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby? if isinstance(node.options, pl_expr.RollingGroupOptions): # pl.col("a").rolling(...) return expr.RollingWindow( - dtype, node.options, translate_expr(visitor, n=node.function) + dtype, node.options, translator.translate_expr(n=node.function) ) elif isinstance(node.options, pl_expr.WindowMapping): # pl.col("a").over(...) 
return expr.GroupedRollingWindow( dtype, node.options, - translate_expr(visitor, n=node.function), - *(translate_expr(visitor, n=n) for n in node.partition_by), + translator.translate_expr(n=node.function), + *(translator.translate_expr(n=n) for n in node.partition_by), ) assert_never(node.options) @_translate_expr.register -def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Literal, translator: Translator, dtype: plc.DataType) -> expr.Expr: if isinstance(node.value, plrs.PySeries): return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value)) value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) @@ -549,42 +620,42 @@ def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> exp @_translate_expr.register -def _(node: pl_expr.Sort, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Sort, translator: Translator, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby - return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) + return expr.Sort(dtype, node.options, translator.translate_expr(n=node.expr)) @_translate_expr.register -def _(node: pl_expr.SortBy, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.SortBy, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.SortBy( dtype, node.sort_options, - translate_expr(visitor, n=node.expr), - *(translate_expr(visitor, n=n) for n in node.by), + translator.translate_expr(n=node.expr), + *(translator.translate_expr(n=n) for n in node.by), ) @_translate_expr.register -def _(node: pl_expr.Gather, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Gather, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Gather( dtype, - translate_expr(visitor, n=node.expr), - translate_expr(visitor, n=node.idx), + translator.translate_expr(n=node.expr), + translator.translate_expr(n=node.idx), ) 
@_translate_expr.register -def _(node: pl_expr.Filter, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Filter, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Filter( dtype, - translate_expr(visitor, n=node.input), - translate_expr(visitor, n=node.by), + translator.translate_expr(n=node.input), + translator.translate_expr(n=node.by), ) @_translate_expr.register -def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - inner = translate_expr(visitor, n=node.expr) +def _(node: pl_expr.Cast, translator: Translator, dtype: plc.DataType) -> expr.Expr: + inner = translator.translate_expr(n=node.expr) # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) @@ -596,17 +667,17 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E @_translate_expr.register -def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Column, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Col(dtype, node.name) @_translate_expr.register -def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Agg, translator: Translator, dtype: plc.DataType) -> expr.Expr: value = expr.Agg( dtype, node.name, node.options, - *(translate_expr(visitor, n=n) for n in node.arguments), + *(translator.translate_expr(n=n) for n in node.arguments), ) if value.name == "count" and value.dtype.id() != plc.TypeId.INT32: return expr.Cast(value.dtype, value) @@ -614,55 +685,30 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex @_translate_expr.register -def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Ternary, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Ternary( 
dtype, - translate_expr(visitor, n=node.predicate), - translate_expr(visitor, n=node.truthy), - translate_expr(visitor, n=node.falsy), + translator.translate_expr(n=node.predicate), + translator.translate_expr(n=node.truthy), + translator.translate_expr(n=node.falsy), ) @_translate_expr.register def _( - node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType + node: pl_expr.BinaryExpr, translator: Translator, dtype: plc.DataType ) -> expr.Expr: return expr.BinOp( dtype, expr.BinOp._MAPPING[node.op], - translate_expr(visitor, n=node.left), - translate_expr(visitor, n=node.right), + translator.translate_expr(n=node.left), + translator.translate_expr(n=node.right), ) @_translate_expr.register -def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Len, translator: Translator, dtype: plc.DataType) -> expr.Expr: value = expr.Len(dtype) if dtype.id() != plc.TypeId.INT32: return expr.Cast(dtype, value) return value # pragma: no cover; never reached since polars len has uint32 dtype - - -def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: - """ - Translate a polars-internal expression IR into our representation. - - Parameters - ---------- - visitor - Polars NodeTraverser object - n - Node to translate, an integer referencing a polars internal node. - - Returns - ------- - Translated IR object. - - Raises - ------ - NotImplementedError - If any translation fails due to unsupported functionality. 
- """ - node = visitor.view_expression(n) - dtype = dtypes.from_polars(visitor.get_dtype(n)) - return _translate_expr(node, visitor, dtype) diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 7b45c1eaa06..2207545aa60 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -10,7 +10,7 @@ from polars import GPUEngine from polars.testing.asserts import assert_frame_equal -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator if TYPE_CHECKING: import polars as pl @@ -117,12 +117,14 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) AssertionError If the specified exceptions were not raised. """ - try: - _ = translate_ir(q._ldf.visit()) - except exceptions: + translator = Translator(q._ldf.visit()) + translator.translate_ir() + if errors := translator.errors: + for err in errors: + assert any( + isinstance(err, err_type) for err_type in exceptions + ), f"Translation DID NOT RAISE {exceptions}" return - except Exception as e: - raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e else: raise AssertionError(f"Translation DID NOT RAISE {exceptions}") diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index a90c283ee54..e7ac72df609 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -71,11 +71,16 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ------- True if casting is supported, False otherwise """ + has_empty = from_.id() == plc.TypeId.EMPTY or to.id() == plc.TypeId.EMPTY return ( ( - plc.traits.is_fixed_width(to) - and plc.traits.is_fixed_width(from_) - and plc.unary.is_supported_cast(from_, to) + from_ == to + or not has_empty + and ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + 
and plc.unary.is_supported_cast(from_, to) + ) ) or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_)) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 17a94c633f8..2f2361223d2 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -458,12 +458,12 @@ translate it to our intermediate representation (IR), and then execute and convert back to polars: ```python -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator q = ... # Convert to our IR -ir = translate_ir(q._ldf.visit()) +ir = Translator(q._ldf.visit()).translate_ir() # DataFrame living on the device result = ir.evaluate(cache={}) diff --git a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py index 8f10f119199..f6c24da0180 100644 --- a/python/cudf_polars/tests/dsl/test_to_ast.py +++ b/python/cudf_polars/tests/dsl/test_to_ast.py @@ -13,7 +13,7 @@ import cudf_polars.dsl.expr as expr_nodes import cudf_polars.dsl.ir as ir_nodes -from cudf_polars import translate_ir +from cudf_polars import Translator from cudf_polars.containers.dataframe import DataFrame, NamedColumn from cudf_polars.dsl.to_ast import insert_colrefs, to_ast, to_parquet_filter @@ -60,7 +60,7 @@ def df(): ) def test_compute_column(expr, df): q = df.select(expr) - ir = translate_ir(q._ldf.visit()) + ir = Translator(q._ldf.visit()).translate_ir() assert isinstance(ir, ir_nodes.Select) table = ir.children[0].evaluate(cache={}) diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py index 15c644d7978..8958c2a0f84 100644 --- a/python/cudf_polars/tests/dsl/test_traversal.py +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -10,7 +10,7 @@ import pylibcudf as plc -from cudf_polars import translate_ir +from cudf_polars import Translator from cudf_polars.dsl 
import expr, ir from cudf_polars.dsl.traversal import ( CachingVisitor, @@ -109,7 +109,7 @@ def test_rewrite_ir_node(): df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]}) q = df.group_by("a").agg(pl.col("b").sum()).sort("b") - orig = translate_ir(q._ldf.visit()) + orig = Translator(q._ldf.visit()).translate_ir() new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]}) @@ -150,7 +150,7 @@ def replace_scan(node, rec): mapper = CachingVisitor(replace_scan) - orig = translate_ir(q._ldf.visit()) + orig = Translator(q._ldf.visit()).translate_ir() new = mapper(orig) result = new.evaluate(cache={}).to_polars() @@ -174,7 +174,7 @@ def test_rewrite_names_and_ops(): .collect() ) - qir = translate_ir(q._ldf.visit()) + qir = Translator(q._ldf.visit()).translate_ir() @singledispatch def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr: diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 62df8ce1498..6170281ad54 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -10,7 +10,7 @@ import pylibcudf as plc -from cudf_polars import translate_ir +from cudf_polars import Translator from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -68,7 +68,7 @@ def test_setsorted(descending, nulls_last, with_nulls): assert_gpu_result_equal(q) - df = translate_ir(q._ldf.visit()).evaluate(cache={}) + df = Translator(q._ldf.visit()).translate_ir().evaluate(cache={}) a = df.column_map["a"] diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index e895f27f637..63aa1c573a9 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -93,16 +93,3 @@ def test_unpivot_defaults(): ) q = df.unpivot(index="d") assert_gpu_result_equal(q) - - -def test_unpivot_unsupported_cast_raises(): - df = pl.LazyFrame( - { - "a": ["x", "y", "z"], - "b": 
pl.Series([1, 3, 5], dtype=pl.Int16), - } - ) - - q = df.unpivot(["a", "b"]) - - assert_ir_translation_raises(q, NotImplementedError) From ccfc95a623e13d59a6e4f640ee7c022bda35f763 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:03:06 -0500 Subject: [PATCH 35/40] Add new nvtext minhash_permuted API (#16756) Introduce new nvtext minhash API that takes a single seed for hashing and 2 parameter vectors to calculate the minhash results from the seed hash: ``` std::unique_ptr minhash_permuted( cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` The `seed` is used to hash the `input` using rolling set of substrings `width` characters wide. The hashes are then combined with the values in `parameter_a` and `parameter_b` to calculate a set of 32-bit (or 64-bit) values for each row. Only the minimum value is returned per element of `a` and `b` when combined with all the hashes for a row. Each output row is a set of M values where `M = parameter_a.size() = parameter_b.size()` This implementation is significantly faster than the current minhash which computes hashes for multiple seeds. Included in this PR is also the `minhash64_permuted()` API that is identical but uses 64-bit values for the seed and the parameter values. Also included are new tests and a benchmark as well as the pylibcudf and cudf interfaces. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16756 --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/text/minhash.cpp | 38 +- cpp/include/nvtext/minhash.hpp | 94 +++++ cpp/src/text/minhash.cu | 390 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/text/minhash_tests.cpp | 267 ++++++------ python/cudf/cudf/_lib/nvtext/minhash.pyx | 28 ++ python/cudf/cudf/_lib/strings/__init__.py | 2 + python/cudf/cudf/core/column/string.py | 107 +++++ .../cudf/cudf/tests/text/test_text_methods.py | 48 +-- .../pylibcudf/libcudf/nvtext/minhash.pxd | 16 + python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 16 + python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 103 +++++ .../pylibcudf/tests/test_nvtext_minhash.py | 12 +- 14 files changed, 949 insertions(+), 177 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index ad090be99f3..59f5602fd5a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 31ce60d8f9a..a80d0dcbdb8 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ 
b/cpp/benchmarks/text/minhash.cpp @@ -20,8 +20,6 @@ #include -#include - #include static void bench_minhash(nvbench::state& state) @@ -29,26 +27,25 @@ static void bench_minhash(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const parameters = static_cast(state.get_int64("parameters")); auto const base64 = state.get_int64("hash_type") == 64; - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); - data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - seeds.set_null_mask(rmm::device_buffer{}, 0); + data_profile const param_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), + distribution_id::NORMAL, + 0u, + std::numeric_limits::max()); + auto const param_type = base64 ? 
cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const param_table = + create_random_table({param_type, param_type}, row_count{parameters}, param_profile); + auto const parameters_a = param_table->view().column(0); + auto const parameters_b = param_table->view().column(1); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -57,15 +54,16 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) - : nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 + ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) - .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10}) - .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("num_rows", {15000, 30000, 60000}) + .add_int64_axis("row_width", {6000, 28000, 50000}) + .add_int64_axis("hash_width", {12, 24}) + .add_int64_axis("parameters", {26, 260}) .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 42124461cdf..b2c1a23f57e 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -94,6 +94,53 @@ namespace CUDF_EXPORT nvtext { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each string + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. 
These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash_permuted( + cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash value for each string * @@ -159,6 +206,53 @@ namespace CUDF_EXPORT nvtext { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the 
minhash values for each string + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint64 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash64_permuted( + cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash values for each row of strings per 
seed * diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index a03a34f5fa7..aee83ab35ed 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -37,9 +38,13 @@ #include #include +#include #include +#include #include #include +#include +#include #include @@ -162,6 +167,339 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return hashes; } +constexpr cudf::thread_index_type block_size = 256; +// for potentially tuning minhash_seed_kernel independently from block_size +constexpr cudf::thread_index_type tile_size = block_size; + +// Number of a/b parameter values to process per thread. +// The intermediate values are stored in shared-memory and therefore limits this count. +// This value was found to be an efficient size for both uint32 and uint64 +// hash types based on benchmarks. +constexpr cuda::std::size_t params_per_thread = 16; + +// Separate kernels are used to process strings above and below this value (in bytes). +constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +// The number of blocks per string for the above-threshold kernel processing. +constexpr cudf::size_type blocks_per_string = 64; +// The above values were determined using the redpajama and books_sample datasets + +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * + * This kernel computes the hashes for each string using the seed and the specified + * hash function. The width is used to compute rolling substrings to hash over. + * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel. + * + * This kernel also counts the number of strings above the wide_string_threshold + * and proactively initializes the output values for those strings. 
+ * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_strings The input strings to hash + * @param seed The seed used for the hash function + * @param width Width in characters used for determining substrings to hash + * @param d_hashes The resulting hash values are stored here + * @param threshold_count Stores the number of strings above wide_string_threshold + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) + */ +template +CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, + hash_value_type seed, + cudf::size_type width, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, + cudf::size_type param_count, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = tid / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + // retrieve this string's offset to locate the output position in d_hashes + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + if (size_bytes == 0) { return; } + + auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); + auto const lane_idx = tid % tile_size; + + // hashes for this string/thread are stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + + auto const begin = d_str.data() + lane_idx; + auto const end = d_str.data() + d_str.size_bytes(); + auto const hasher = HashFunction(seed); + + for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) { + if 
(cudf::strings::detail::is_utf8_continuation_char(*itr)) { + *seed_hashes = 0; + continue; + } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { + // true itr+width is past the end of the string + *seed_hashes = 0; + continue; + } + + auto const hash_str = cudf::string_view(itr, bytes); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = thrust::get<0>(hasher(hash_str)); + } + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here so an extra kernel is not required + if (size_bytes >= wide_string_threshold) { + if (lane_idx == 0) { + // count the number of wide strings + cuda::atomic_ref ref{*threshold_count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + // initialize the output -- only needed for wider strings + auto d_output = d_results + (str_idx * param_count); + for (auto i = lane_idx; i < param_count; i += tile_size) { + d_output[i] = std::numeric_limits::max(); + } + } +} + +/** + * @brief Permutation calculation kernel + * + * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and + * parameter_b values to compute the final output results. + * The output is the number of input rows (N) by the number of parameter values (M). + * Each output[i] is the calculated result for parameter_a/b[0:M]. + * + * This kernel is launched with either blocks per strings of 1 for strings + * below the wide_strings_threshold or blocks per string = blocks_per_strings + * for strings above wide_strings_threshold. 
+ * + * @tparam hash_value_type Derived from HashFunction result_type + * @tparam blocks_per_string Number of blocks used to process each string + * + * @param d_strings The input strings to hash + * @param indices The indices of the strings in d_strings to process + * @param parameter_a 1st set of parameters for the calculation result + * @param parameter_b 2nd set of parameters for the calculation result + * @param width Used for calculating the number of available hashes in each string + * @param d_hashes The hash values computed in minhash_seed_kernel + * @param d_results Final results vector of calculate values + */ +template +CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, + cudf::device_span indices, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + hash_value_type const* d_hashes, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const idx = (tid / blocks_per_string) / block_size; + if (idx >= indices.size()) { return; } + auto const str_idx = indices[idx]; + if (d_strings.is_null(str_idx)) { return; } + + auto const block = cooperative_groups::this_thread_block(); + int const section_idx = block.group_index().x % blocks_per_string; + + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + + // number of items to process in this block; + // last block also includes any remainder values from the size_bytes/blocks_per_string truncation + // example: + // each section_size for string with size 588090 and blocks_per_string=64 is 9188 + // except the last section which is 9188 + (588090 % 64) = 9246 + auto const section_size = + (size_bytes / blocks_per_string) + + 
(section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string); + auto const section_offset = section_idx * (size_bytes / blocks_per_string); + + // hash values for this block/section + auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset; + // width used here as a max value since a string's char-count <= byte-count + auto const hashes_size = + section_idx < (blocks_per_string - 1) + ? section_size + : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); + + auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); + auto const lane_idx = block.thread_rank(); + auto const d_output = d_results + (str_idx * parameter_a.size()); + + auto const begin = seed_hashes + lane_idx; + auto const end = seed_hashes + hashes_size; + + // constants used in the permutation calculations + constexpr uint64_t mersenne_prime = (1UL << 61) - 1; + constexpr hash_value_type hash_max = std::numeric_limits::max(); + + // found to be an efficient shared memory size for both hash types + __shared__ hash_value_type block_values[block_size * params_per_thread]; + + for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) { + // initialize this block's chunk of shared memory + // each thread handles params_per_thread of values + auto const chunk_values = block_values + (lane_idx * params_per_thread); + thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init); + block.sync(); + + auto const param_count = + cuda::std::min(static_cast(params_per_thread), parameter_a.size() - i); + + // each lane accumulates min hashes in its shared memory + for (auto itr = begin; itr < end; itr += block_size) { + auto const hv = *itr; + // 0 is used as a skip sentinel for UTF-8 and trailing bytes + if (hv == 0) { continue; } + + for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { + // permutation formula used by datatrove + hash_value_type const v = + ((hv * 
parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; + auto const block_idx = ((param_idx % params_per_thread) * block_size) + lane_idx; + block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); + } + } + block.sync(); + + // reduce each parameter values vector to a single min value; + // assumes that the block_size > params_per_thread; + // each thread reduces a block_size of parameter values (thread per parameter) + if (lane_idx < param_count) { + auto const values = block_values + (lane_idx * block_size); + // cooperative groups does not have a min function and cub::BlockReduce was slower + auto const minv = + thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); + if constexpr (blocks_per_string > 1) { + // accumulates mins for each block into d_output + cuda::atomic_ref ref{d_output[lane_idx + i]}; + ref.fetch_min(minv, cuda::std::memory_order_relaxed); + } else { + d_output[lane_idx + i] = minv; + } + } + block.sync(); + } +} + +template +std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + hash_value_type seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(width >= 2, + "Parameter width should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + CUDF_EXPECTS( + (static_cast(input.size()) * parameter_a.size()) < + static_cast(std::numeric_limits::max()), + "The number of parameters times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return 
cudf::make_empty_column(output_type); } + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto results = + cudf::make_numeric_column(output_type, + input.size() * static_cast(parameter_a.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_results = results->mutable_view().data(); + + cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, + block_size}; + auto const hashes_size = input.chars_size(stream); + auto d_hashes = rmm::device_uvector(hashes_size, stream); + auto d_threshold_count = cudf::detail::device_scalar(0, stream); + + minhash_seed_kernel + <<>>(*d_strings, + seed, + width, + d_hashes.data(), + d_threshold_count.data(), + parameter_a.size(), + d_results); + auto const threshold_count = d_threshold_count.value(stream); + + auto indices = rmm::device_uvector(input.size(), stream); + thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); + cudf::size_type threshold_index = threshold_count < input.size() ? 
input.size() : 0; + + // if we counted a split of above/below threshold then + // compute partitions based on the size of each string + if ((threshold_count > 0) && (threshold_count < input.size())) { + auto sizes = rmm::device_uvector(input.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + sizes.data(), + cuda::proclaim_return_type( + [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { + if (d_strings.is_null(idx)) { return 0; } + return d_strings.element(idx).size_bytes(); + })); + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); + auto const lb = thrust::lower_bound( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); + threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + } + + // handle the strings below the threshold width + if (threshold_index > 0) { + auto d_indices = cudf::device_span(indices.data(), threshold_index); + cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, + block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + // handle the strings above the threshold width + if (threshold_index < input.size()) { + auto const count = static_cast(input.size() - threshold_index); + auto d_indices = + cudf::device_span(indices.data() + threshold_index, count); + cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + return results; +} + /** * @brief Compute the minhash of each list row of strings for each seed * @@ -309,6 +647,20 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } 
+std::unique_ptr minhash(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar const& seed, cudf::size_type width, @@ -333,6 +685,20 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } +std::unique_ptr minhash64(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, @@ -374,6 +740,18 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return detail::minhash(input, seeds, width, stream, mr); } +std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view 
const& input, cudf::numeric_scalar seed, cudf::size_type width, @@ -394,6 +772,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3a9b930830b..cbca0ceef77 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -610,6 +610,7 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp + text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index ef35a4472cf..042ac44621e 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -28,155 +28,169 @@ struct MinHashTest : public cudf::test::BaseFixture {}; -TEST_F(MinHashTest, Basic) +TEST_F(MinHashTest, Permuted) { - auto validity = cudf::test::iterators::null_at(1); auto input = cudf::test::strings_column_wrapper({"doc 1", - "", "this is doc 2", - "", "doc 3", "d", - "The quick brown fox jumpéd over the lazy brown dog."}, - validity); + "The quick brown fox jumpéd over the lazy brown dog.", + "line six", + "line seven", + "line eight", + "line nine", + "line ten"}); auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view); + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + 
nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1392101586u, 394869177u, 811528444u}, + LCW32{ 211415830u, 187088503u, 130291444u}, + LCW32{2098117052u, 394869177u, 799753544u}, + LCW32{2264583304u, 2920538364u, 3576493424u}, + LCW32{ 253327882u, 41747273u, 302030804u}, + LCW32{2109809594u, 1017470651u, 326988172u}, + LCW32{1303819864u, 850676747u, 147107852u}, + LCW32{ 736021564u, 720812292u, 1405158760u}, + LCW32{ 902780242u, 134064807u, 1613944636u}, + LCW32{ 547084870u, 1748895564u, 656501844u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto results64 = nvtext::minhash64(view); - auto expected64 = cudf::test::fixed_width_column_wrapper({774489391575805754ul, - 0ul, - 3232308021562742685ul, - 0ul, - 13145552576991307582ul, - 14660046701545912182ul, - 398062025280761388ul}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); -TEST_F(MinHashTest, LengthEqualsWidth) -{ - auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); - auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view, 0, 5); - auto expected = cudf::test::fixed_width_column_wrapper( - {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 827364888116975697ul, 1601854279692781452ul, 70500662054893256ul}, + LCW64{ 18312093741021833ul, 
133793446674258329ul, 21974512489226198ul}, + LCW64{ 22474244732520567ul, 1638811775655358395ul, 949306297364502264ul}, + LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul}, + LCW64{ 65816830624808020ul, 43323600380520789ul, 63511816333816345ul}, + LCW64{ 629657184954525200ul, 49741036507643002ul, 97466271004074331ul}, + LCW64{ 301611977846331113ul, 101188874709594830ul, 97466271004074331ul}, + LCW64{ 121498891461700668ul, 171065800427907402ul, 97466271004074331ul}, + LCW64{ 54617739511834072ul, 231454301607238929ul, 97466271004074331ul}, + LCW64{ 576418665851990314ul, 231454301607238929ul, 97466271004074331ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeed) +TEST_F(MinHashTest, PermutedWide) { - auto input = - cudf::test::strings_column_wrapper({"doc 1", - "this is doc 2", - "doc 3", - "d", - "The quick brown fox jumpéd over the lazy brown dog."}); - - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); // below wide_string_threshold + std::string const wide(2 << 19, 'y'); // above wide_string_threshold + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; + using LCW32 = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, - LCW{ 21141582u, 580916568u, 1258052021u}, - LCW{1207251914u, 943567174u, 1109272887u}, - LCW{ 655955059u, 488346356u, 2394664816u}, - LCW{ 86520422u, 236622901u, 102546228u}}); + LCW32 
expected({ + LCW32{1731998032u, 315359380u, 3193688024u}, + LCW32{1293098788u, 2860992281u, 133918478u} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off - LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, - LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, - LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, - LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, - LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}}); + LCW64 expected64({ + LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul}, + LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeedWithNullInputRow) +TEST_F(MinHashTest, PermutedManyParameters) { - auto validity = cudf::test::iterators::null_at(1); - auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); + std::string const wide(2 << 19, 'y'); + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + // more than params_per_thread + auto params = 
cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, - validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u, 1777049372u, 360410720u, 3238739364u, 1822100712u, 405462060u, + 3283790704u, 1867152052u, 450513400u, 3328842044u, 1912203392u, 495564740u, 3373893384u, 1957254732u, + 540616080u, 3418944724u, 2002306072u, 585667420u, 3463996064u, 2047357412u, 630718760u, 3509047404u, + 2092408752u, 675770100u, 3554098744u, 2137460092u, 720821440u, 3599150084u, 2182511432u}, + LCW32{1293098788u, 2860992281u, 133918478u, 1701811971u, 3269705464u, 542631661u, 2110525154u, 3678418647u, + 951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u, 200877717u, 1768771210u, 3336664703u, + 609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u, + 267836956u, 1835730449u, 3403623942u, 676550139u, 2244443632u, 3812337125u, 1085263322u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + // more than params_per_thread + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul}, - LCW64{}, - LCW64{0ul, 0ul}, - LCW64{2717781266371273264ul, 6977325820868387259ul}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - -TEST_F(MinHashTest, 
WordsMinHash) -{ - using LCWS = cudf::test::lists_column_wrapper; - auto validity = cudf::test::iterators::null_at(1); - - LCWS input( - {LCWS({"hello", "abcdéfgh"}), - LCWS{}, - LCWS({"rapids", "moré", "test", "text"}), - LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, - validity); - - auto view = cudf::lists_column_view(input); - - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); - using LCW32 = cudf::test::lists_column_wrapper; - LCW32 expected({LCW32{2069617641u, 1975382903u}, - LCW32{}, - LCW32{657297235u, 1010955999u}, - LCW32{644643885u, 310002789u}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); - using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, - LCW64{}, - LCW64{5331949571924938590ul, 2088583894581919741ul}, - LCW64{3400468157617183341ul, 2398577492366130055ul}}, - validity); + // clang-format off + LCW64 expected64({ + LCW64{1818322427062143853, 641024893347719371, 1769570368846988848, 592272835132564366, + 1720818310631833835, 543520776917409353, 1672066252416678822, 494768718702254348, + 1623314194201523817, 446016660487099335, 1574562135986368804, 397264602271944322, + 1525810077771213799, 348512544056789317, 1477058019556058786, 299760485841634304, + 1428305961340903773, 251008427626479291, 1379553903125748768, 202256369411324286, + 1330801844910593755, 153504311196169273, 1282049786695438742, 104752252981014268, + 1233297728480283737, 56000194765859255, 1184545670265128724, 7248136550704242, + 1135793612049973719, 2264339087549243188, 1087041553834818706}, + LCW64{1389920339306667795, 421787002125838902, 1759496674158703968, 791363336977875075, + 2129073009010740141, 1160939671829911248, 
192806334649082363, 1530516006681947421, + 562382669501118536, 1900092341533983602, 931959004353154709, 2269668676386019775, + 1301535339205190882, 333402002024361997, 1671111674057227055, 702978336876398170, + 2040688008909263228, 1072554671728434343, 104421334547605450, 1442131006580470516, + 473997669399641631, 1811707341432506689, 843574004251677804, 2181283676284542862, + 1213150339103713977, 245017001922885084, 1582726673955750150, 614593336774921257, + 1952303008807786323, 984169671626957438, 16036334446128545} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto results = nvtext::minhash(view); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); - results = nvtext::minhash64(view); + auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + results = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -184,20 +198,39 @@ TEST_F(MinHashTest, ErrorsTest) { auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); - EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); - auto seeds64 = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash64(view, 
cudf::column_view(seeds64)), std::invalid_argument); + auto empty = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); + auto empty64 = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); view = cudf::strings_column_view(input); auto const zeroes = thrust::constant_iterator(0); - seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); - seeds64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error); + auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), + std::overflow_error); + + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + std::invalid_argument); } diff --git 
a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5e39cafa47b..25cfcf99ca6 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t, uint64_t + from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column @@ -17,6 +19,19 @@ def minhash(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def minhash64(Column input, Column seeds, int width=4): result = nvtext.minhash.minhash64( @@ -27,6 +42,19 @@ def minhash64(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash64_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def word_minhash(Column input, Column seeds): result = nvtext.minhash.word_minhash( diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index ffa5e603408..4c0ec2d9ac5 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,6 +9,8 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, + minhash64_permuted, + minhash_permuted, word_minhash, word_minhash64, ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 856ce0f75de..3d70b01b7e4 100644 --- 
a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5350,11 +5350,65 @@ def minhash( libstrings.minhash(self._column, seeds_column, width) ) + def minhash_permuted( + self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book']) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + 0 [1305480171, 462824409, 74608232] + 1 [32665388, 65330773, 97996158] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def minhash64( self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. This function generates 2 uint64 values but only the first uint64 value is used. 
@@ -5390,6 +5444,59 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def minhash64_permuted( + self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + 0 [172452388517576012, 316595762085180527] + 1 [71427536958126239, 58787297728258215] + 2 [423885828176437114, 1140588505926961370] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash64_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: """ Compute the minhash of a list column of strings. 
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 997ca357986..47e541fdcef 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,68 +882,48 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash(): +def test_minhash_permuted(): strings = cudf.Series(["this is my", "favorite book", None, ""]) + params = cudf.Series([1, 2, 3], dtype=np.uint32) expected = cudf.Series( [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - None, - cudf.Series([0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32), + cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32), None, cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash(seeds=seeds, width=5) + actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) - expected = cudf.Series( - [ - cudf.Series([3232308021562742685], dtype=np.uint64), - cudf.Series([23008204270530356], dtype=np.uint64), - None, - cudf.Series([0], dtype=np.uint64), - ] - ) - actual = strings.str.minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + params = cudf.Series([1, 2, 3], dtype=np.uint64) expected = cudf.Series( [ cudf.Series( - [7082801294247314046, 185949556058924788, 167570629329462454], + [105531920695060180, 172452388517576009, 316595762085180524], dtype=np.uint64, ), cudf.Series( - [382665377781028452, 86243762733551437, 7688750597953083512], + [35713768479063122, 71427536958126236, 58787297728258212], dtype=np.uint64, ), None, 
cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64(seeds=seeds, width=5) + actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash(seeds="a") + strings.str.minhash_permuted(1, a="a", b="b", width=7) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_permuted(1, a=params, b=params, width=6) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_permuted(1, a=params, b=params, width=8) def test_word_minhash(): diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 41250037dcf..ebf8eda1ce3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -22,6 +22,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash_permuted( + const column_view &strings, + const uint32_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] minhash64( const column_view &strings, const column_view &seeds, @@ -34,6 +42,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64_permuted( + const column_view &strings, + const uint64_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd 
b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 97e8c9dc83c..6b544282f44 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -11,8 +11,24 @@ ctypedef fused ColumnOrScalar: cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +) + cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +) + cpdef Column word_minhash(Column input, Column seeds) cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index f1e012e60e5..5a51e32b287 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + minhash64_permuted as cpp_minhash64_permuted, + minhash_permuted as cpp_minhash_permuted, word_minhash as cpp_word_minhash, word_minhash64 as cpp_word_minhash64, ) @@ -16,6 +18,7 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from cython.operator import dereference +import warnings cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): @@ -40,6 +43,12 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): Column List column of minhash values for each string per seed """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash_permuted.", + FutureWarning + ) + cdef unique_ptr[column] c_result if not isinstance(seeds, (Column, Scalar)): @@ -55,6 +64,50 @@ cpdef Column 
minhash(Column input, ColumnOrScalar seeds, size_type width=4): return Column.from_libcudf(move(c_result)) +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +): + """ + Returns the minhash values for each string. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash_permuted`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seed : uint32_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + width : size_type + Character width used for apply substrings; + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash_permuted( + input.view(), + seed, + a.view(), + b.view(), + width + ) + + return Column.from_libcudf(move(c_result)) + cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): """ Returns the minhash values for each string per seed. @@ -77,6 +130,12 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): Column List column of minhash values for each string per seed """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash64_permuted.", + FutureWarning + ) + cdef unique_ptr[column] c_result if not isinstance(seeds, (Column, Scalar)): @@ -92,6 +151,50 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): return Column.from_libcudf(move(c_result)) +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +): + """ + Returns the minhash values for each string. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64_permuted`. 
+ + Parameters + ---------- + input : Column + Strings column to compute minhash + seed : uint64_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + width : size_type + Character width used for apply substrings; + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash64_permuted( + input.view(), + seed, + a.view(), + b.view(), + width + ) + + return Column.from_libcudf(move(c_result)) + cpdef Column word_minhash(Column input, Column seeds): """ Returns the minhash values for each row of strings per seed. diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ead9ee094af..ec533e64307 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -21,15 +21,19 @@ def word_minhash_input_data(request): @pytest.mark.parametrize("width", [5, 12]) -def test_minhash(minhash_input_data, width): +def test_minhash_permuted(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash + plc.nvtext.minhash.minhash_permuted if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64 + else plc.nvtext.minhash.minhash64_permuted ) result = minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + plc.interop.from_arrow(input_arr), + 0, + plc.interop.from_arrow(seeds), + plc.interop.from_arrow(seeds), + width, ) pa_result = plc.interop.to_arrow(result) assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) From 7682edbfd418cf30c0f5494dbed36a5dbb102c06 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 12 Nov 2024 15:57:36 +0000 Subject: [PATCH 36/40] Add type stubs for pylibcudf 
(#17258) Having looked at a bunch of the automation options, I just did it by hand. A followup will add some automation to add docstrings (so we can see those via LSP integration in editors) and do some simple validation. - Closes #15190 Authors: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17258 --- docs/cudf/source/conf.py | 73 ++++++- docs/cudf/source/developer_guide/pylibcudf.md | 73 ++++++- python/cudf/cudf/_lib/labeling.pyx | 4 +- python/cudf/cudf/_lib/lists.pyx | 24 +-- .../cudf_polars/containers/dataframe.py | 2 +- .../cudf_polars/dsl/expressions/datetime.py | 4 +- .../cudf_polars/dsl/expressions/literal.py | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 2 +- python/pylibcudf/pylibcudf/aggregation.pyi | 110 +++++++++++ python/pylibcudf/pylibcudf/aggregation.pyx | 34 ++++ python/pylibcudf/pylibcudf/binaryop.pyi | 54 +++++ python/pylibcudf/pylibcudf/binaryop.pyx | 1 + python/pylibcudf/pylibcudf/column.pyi | 48 +++++ python/pylibcudf/pylibcudf/column.pyx | 5 + .../pylibcudf/pylibcudf/column_factories.pyi | 20 ++ .../pylibcudf/pylibcudf/column_factories.pyx | 9 + python/pylibcudf/pylibcudf/concatenate.pyi | 8 + python/pylibcudf/pylibcudf/concatenate.pyx | 1 + .../pylibcudf/pylibcudf/contiguous_split.pyi | 14 ++ .../pylibcudf/pylibcudf/contiguous_split.pyx | 11 ++ python/pylibcudf/pylibcudf/copying.pyi | 54 +++++ python/pylibcudf/pylibcudf/copying.pyx | 17 ++ python/pylibcudf/pylibcudf/datetime.pyi | 45 +++++ python/pylibcudf/pylibcudf/datetime.pyx | 18 ++ python/pylibcudf/pylibcudf/experimental.pyi | 5 + python/pylibcudf/pylibcudf/experimental.pyx | 2 + python/pylibcudf/pylibcudf/expressions.pyi | 79 ++++++++ python/pylibcudf/pylibcudf/expressions.pyx | 12 +- python/pylibcudf/pylibcudf/filling.pyi | 17 ++ python/pylibcudf/pylibcudf/filling.pyx | 8 + 
python/pylibcudf/pylibcudf/gpumemoryview.pyi | 9 + python/pylibcudf/pylibcudf/gpumemoryview.pyx | 3 + python/pylibcudf/pylibcudf/groupby.pyi | 38 ++++ python/pylibcudf/pylibcudf/groupby.pyx | 6 + python/pylibcudf/pylibcudf/hashing.pyi | 18 ++ python/pylibcudf/pylibcudf/hashing.pyx | 13 ++ python/pylibcudf/pylibcudf/interop.pyi | 52 +++++ python/pylibcudf/pylibcudf/interop.pyx | 8 + python/pylibcudf/pylibcudf/io/__init__.py | 16 ++ python/pylibcudf/pylibcudf/io/avro.pyi | 11 ++ python/pylibcudf/pylibcudf/io/avro.pyx | 2 + python/pylibcudf/pylibcudf/io/csv.pyi | 54 +++++ python/pylibcudf/pylibcudf/io/csv.pyx | 2 + python/pylibcudf/pylibcudf/io/datasource.pyi | 4 + python/pylibcudf/pylibcudf/io/datasource.pyx | 2 + python/pylibcudf/pylibcudf/io/json.pyi | 50 +++++ python/pylibcudf/pylibcudf/io/json.pyx | 1 + python/pylibcudf/pylibcudf/io/orc.pyi | 41 ++++ python/pylibcudf/pylibcudf/io/orc.pyx | 10 + python/pylibcudf/pylibcudf/io/parquet.pyi | 36 ++++ python/pylibcudf/pylibcudf/io/parquet.pyx | 4 + .../pylibcudf/io/parquet_metadata.pyx | 9 +- python/pylibcudf/pylibcudf/io/text.pyx | 9 + python/pylibcudf/pylibcudf/io/timezone.pyi | 7 + python/pylibcudf/pylibcudf/io/timezone.pyx | 1 + python/pylibcudf/pylibcudf/io/types.pyi | 97 +++++++++ python/pylibcudf/pylibcudf/io/types.pyx | 18 ++ python/pylibcudf/pylibcudf/join.pyi | 78 ++++++++ python/pylibcudf/pylibcudf/join.pyx | 18 ++ python/pylibcudf/pylibcudf/json.pyi | 23 +++ python/pylibcudf/pylibcudf/json.pyx | 3 + python/pylibcudf/pylibcudf/labeling.pxd | 4 +- python/pylibcudf/pylibcudf/labeling.pyi | 17 ++ python/pylibcudf/pylibcudf/labeling.pyx | 24 +-- .../pylibcudf/libcudf/CMakeLists.txt | 1 + .../pylibcudf/libcudf/lists/CMakeLists.txt | 23 +++ .../pylibcudf/libcudf/lists/combine.pxd | 8 +- .../pylibcudf/libcudf/lists/combine.pyx | 0 .../pylibcudf/libcudf/lists/contains.pyx | 0 python/pylibcudf/pylibcudf/lists.pxd | 30 ++- python/pylibcudf/pylibcudf/lists.pyi | 70 +++++++ python/pylibcudf/pylibcudf/lists.pyx | 185 
++++++++---------- python/pylibcudf/pylibcudf/merge.pyi | 11 ++ python/pylibcudf/pylibcudf/merge.pyx | 1 + python/pylibcudf/pylibcudf/null_mask.pyi | 14 ++ python/pylibcudf/pylibcudf/null_mask.pyx | 7 + .../pylibcudf/nvtext/byte_pair_encode.pyi | 11 ++ .../pylibcudf/nvtext/byte_pair_encode.pyx | 3 + .../pylibcudf/nvtext/edit_distance.pyi | 6 + .../pylibcudf/nvtext/edit_distance.pyx | 1 + .../pylibcudf/nvtext/generate_ngrams.pyi | 10 + .../pylibcudf/nvtext/generate_ngrams.pyx | 5 + python/pylibcudf/pylibcudf/nvtext/jaccard.pyi | 5 + python/pylibcudf/pylibcudf/nvtext/jaccard.pyx | 1 + python/pylibcudf/pylibcudf/nvtext/minhash.pyi | 13 ++ python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 6 + .../pylibcudf/nvtext/ngrams_tokenize.pyi | 8 + .../pylibcudf/nvtext/ngrams_tokenize.pyx | 1 + .../pylibcudf/pylibcudf/nvtext/normalize.pyi | 6 + .../pylibcudf/pylibcudf/nvtext/normalize.pyx | 1 + python/pylibcudf/pylibcudf/nvtext/replace.pyi | 17 ++ python/pylibcudf/pylibcudf/nvtext/replace.pyx | 1 + python/pylibcudf/pylibcudf/nvtext/stemmer.pyi | 8 + python/pylibcudf/pylibcudf/nvtext/stemmer.pyx | 1 + .../pylibcudf/nvtext/subword_tokenize.pyi | 15 ++ .../pylibcudf/nvtext/subword_tokenize.pyx | 3 + .../pylibcudf/pylibcudf/nvtext/tokenize.pyi | 26 +++ .../pylibcudf/pylibcudf/nvtext/tokenize.pyx | 12 ++ python/pylibcudf/pylibcudf/partitioning.pyi | 14 ++ python/pylibcudf/pylibcudf/partitioning.pyx | 5 + python/pylibcudf/pylibcudf/py.typed | 0 python/pylibcudf/pylibcudf/quantiles.pyi | 23 +++ python/pylibcudf/pylibcudf/quantiles.pyx | 1 + python/pylibcudf/pylibcudf/reduce.pyi | 16 ++ python/pylibcudf/pylibcudf/reduce.pyx | 1 + python/pylibcudf/pylibcudf/replace.pyi | 29 +++ python/pylibcudf/pylibcudf/replace.pyx | 8 + python/pylibcudf/pylibcudf/reshape.pyi | 7 + python/pylibcudf/pylibcudf/reshape.pyx | 1 + python/pylibcudf/pylibcudf/rolling.pyi | 12 ++ python/pylibcudf/pylibcudf/rolling.pyx | 1 + python/pylibcudf/pylibcudf/round.pyi | 15 ++ python/pylibcudf/pylibcudf/round.pyx | 1 + 
python/pylibcudf/pylibcudf/scalar.pyi | 10 + python/pylibcudf/pylibcudf/scalar.pyx | 4 + python/pylibcudf/pylibcudf/search.pyi | 19 ++ python/pylibcudf/pylibcudf/search.pyx | 1 + python/pylibcudf/pylibcudf/sorting.pyi | 64 ++++++ python/pylibcudf/pylibcudf/sorting.pyx | 12 ++ .../pylibcudf/pylibcudf/stream_compaction.pxd | 2 + .../pylibcudf/pylibcudf/stream_compaction.pyi | 53 +++++ .../pylibcudf/pylibcudf/stream_compaction.pyx | 12 ++ .../pylibcudf/pylibcudf/strings/__init__.py | 4 +- .../pylibcudf/strings/attributes.pyi | 7 + .../pylibcudf/strings/attributes.pyx | 1 + .../pylibcudf/strings/capitalize.pyi | 12 ++ .../pylibcudf/strings/capitalize.pyx | 1 + python/pylibcudf/pylibcudf/strings/case.pyi | 7 + python/pylibcudf/pylibcudf/strings/case.pyx | 1 + .../pylibcudf/strings/char_types.pyi | 30 +++ .../pylibcudf/strings/char_types.pyx | 5 + .../pylibcudf/pylibcudf/strings/combine.pyi | 34 ++++ .../pylibcudf/pylibcudf/strings/combine.pyx | 7 + .../pylibcudf/pylibcudf/strings/contains.pyi | 14 ++ .../pylibcudf/pylibcudf/strings/contains.pyx | 1 + .../pylibcudf/strings/convert/__init__.py | 12 ++ .../strings/convert/convert_booleans.pyi | 9 + .../strings/convert/convert_booleans.pyx | 1 + .../strings/convert/convert_datetime.pyi | 12 ++ .../strings/convert/convert_datetime.pyx | 1 + .../strings/convert/convert_durations.pyi | 9 + .../strings/convert/convert_durations.pyx | 1 + .../strings/convert/convert_fixed_point.pyi | 10 + .../strings/convert/convert_fixed_point.pyx | 2 + .../strings/convert/convert_floats.pyi | 8 + .../strings/convert/convert_floats.pyx | 1 + .../strings/convert/convert_integers.pyi | 11 ++ .../strings/convert/convert_integers.pyx | 8 + .../strings/convert/convert_ipv4.pyi | 7 + .../strings/convert/convert_ipv4.pyx | 1 + .../strings/convert/convert_lists.pyi | 10 + .../strings/convert/convert_lists.pyx | 1 + .../strings/convert/convert_urls.pyi | 6 + .../strings/convert/convert_urls.pyx | 1 + .../pylibcudf/pylibcudf/strings/extract.pyi | 8 + 
.../pylibcudf/pylibcudf/strings/extract.pyx | 1 + python/pylibcudf/pylibcudf/strings/find.pyi | 14 ++ python/pylibcudf/pylibcudf/strings/find.pyx | 1 + .../pylibcudf/strings/find_multiple.pyi | 5 + .../pylibcudf/strings/find_multiple.pyx | 1 + .../pylibcudf/pylibcudf/strings/findall.pyi | 7 + .../pylibcudf/pylibcudf/strings/findall.pyx | 1 + .../pylibcudf/pylibcudf/strings/padding.pyi | 9 + .../pylibcudf/pylibcudf/strings/padding.pyx | 1 + .../pylibcudf/strings/regex_flags.pyi | 7 + .../pylibcudf/strings/regex_flags.pyx | 2 + .../pylibcudf/strings/regex_program.pyi | 8 + .../pylibcudf/strings/regex_program.pyx | 3 + python/pylibcudf/pylibcudf/strings/repeat.pyi | 5 + python/pylibcudf/pylibcudf/strings/repeat.pyx | 1 + .../pylibcudf/pylibcudf/strings/replace.pyi | 14 ++ .../pylibcudf/pylibcudf/strings/replace.pyx | 1 + .../pylibcudf/strings/replace_re.pyi | 27 +++ .../pylibcudf/strings/replace_re.pyx | 1 + .../pylibcudf/pylibcudf/strings/side_type.pyi | 7 + .../pylibcudf/pylibcudf/strings/side_type.pyx | 2 + python/pylibcudf/pylibcudf/strings/slice.pyi | 11 ++ python/pylibcudf/pylibcudf/strings/slice.pyx | 1 + .../pylibcudf/strings/split/__init__.py | 2 + .../pylibcudf/strings/split/partition.pyi | 8 + .../pylibcudf/strings/split/partition.pyx | 1 + .../pylibcudf/strings/split/split.pyi | 27 +++ .../pylibcudf/strings/split/split.pyx | 10 + python/pylibcudf/pylibcudf/strings/strip.pyi | 11 ++ python/pylibcudf/pylibcudf/strings/strip.pyx | 1 + .../pylibcudf/pylibcudf/strings/translate.pyi | 20 ++ .../pylibcudf/pylibcudf/strings/translate.pyx | 1 + python/pylibcudf/pylibcudf/strings/wrap.pyi | 5 + python/pylibcudf/pylibcudf/strings/wrap.pyx | 1 + python/pylibcudf/pylibcudf/table.pyi | 9 + python/pylibcudf/pylibcudf/table.pyx | 3 + .../pylibcudf/tests/test_binaryops.py | 14 -- .../pylibcudf/tests/test_labeling.py | 8 +- .../pylibcudf/pylibcudf/tests/test_lists.py | 83 ++++---- .../pylibcudf/tests/test_string_attributes.py | 2 +- python/pylibcudf/pylibcudf/traits.pyi | 
23 +++ python/pylibcudf/pylibcudf/traits.pyx | 21 ++ python/pylibcudf/pylibcudf/transform.pyi | 16 ++ python/pylibcudf/pylibcudf/transform.pyx | 9 + python/pylibcudf/pylibcudf/transpose.pyi | 4 + python/pylibcudf/pylibcudf/transpose.pyx | 1 + python/pylibcudf/pylibcudf/types.pyi | 86 ++++++++ python/pylibcudf/pylibcudf/types.pyx | 16 ++ python/pylibcudf/pylibcudf/unary.pyi | 38 ++++ python/pylibcudf/pylibcudf/unary.pyx | 10 + python/pylibcudf/pyproject.toml | 23 ++- 206 files changed, 2863 insertions(+), 228 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/aggregation.pyi create mode 100644 python/pylibcudf/pylibcudf/binaryop.pyi create mode 100644 python/pylibcudf/pylibcudf/column.pyi create mode 100644 python/pylibcudf/pylibcudf/column_factories.pyi create mode 100644 python/pylibcudf/pylibcudf/concatenate.pyi create mode 100644 python/pylibcudf/pylibcudf/contiguous_split.pyi create mode 100644 python/pylibcudf/pylibcudf/copying.pyi create mode 100644 python/pylibcudf/pylibcudf/datetime.pyi create mode 100644 python/pylibcudf/pylibcudf/experimental.pyi create mode 100644 python/pylibcudf/pylibcudf/expressions.pyi create mode 100644 python/pylibcudf/pylibcudf/filling.pyi create mode 100644 python/pylibcudf/pylibcudf/gpumemoryview.pyi create mode 100644 python/pylibcudf/pylibcudf/groupby.pyi create mode 100644 python/pylibcudf/pylibcudf/hashing.pyi create mode 100644 python/pylibcudf/pylibcudf/interop.pyi create mode 100644 python/pylibcudf/pylibcudf/io/avro.pyi create mode 100644 python/pylibcudf/pylibcudf/io/csv.pyi create mode 100644 python/pylibcudf/pylibcudf/io/datasource.pyi create mode 100644 python/pylibcudf/pylibcudf/io/json.pyi create mode 100644 python/pylibcudf/pylibcudf/io/orc.pyi create mode 100644 python/pylibcudf/pylibcudf/io/parquet.pyi create mode 100644 python/pylibcudf/pylibcudf/io/timezone.pyi create mode 100644 python/pylibcudf/pylibcudf/io/types.pyi create mode 100644 python/pylibcudf/pylibcudf/join.pyi create mode 100644 
python/pylibcudf/pylibcudf/json.pyi create mode 100644 python/pylibcudf/pylibcudf/labeling.pyi create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx create mode 100644 python/pylibcudf/pylibcudf/lists.pyi create mode 100644 python/pylibcudf/pylibcudf/merge.pyi create mode 100644 python/pylibcudf/pylibcudf/null_mask.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/minhash.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/normalize.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/replace.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/stemmer.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/tokenize.pyi create mode 100644 python/pylibcudf/pylibcudf/partitioning.pyi create mode 100644 python/pylibcudf/pylibcudf/py.typed create mode 100644 python/pylibcudf/pylibcudf/quantiles.pyi create mode 100644 python/pylibcudf/pylibcudf/reduce.pyi create mode 100644 python/pylibcudf/pylibcudf/replace.pyi create mode 100644 python/pylibcudf/pylibcudf/reshape.pyi create mode 100644 python/pylibcudf/pylibcudf/rolling.pyi create mode 100644 python/pylibcudf/pylibcudf/round.pyi create mode 100644 python/pylibcudf/pylibcudf/scalar.pyi create mode 100644 python/pylibcudf/pylibcudf/search.pyi create mode 100644 python/pylibcudf/pylibcudf/sorting.pyi create mode 100644 python/pylibcudf/pylibcudf/stream_compaction.pyi create mode 100644 
python/pylibcudf/pylibcudf/strings/attributes.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/capitalize.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/case.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/char_types.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/combine.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/contains.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/extract.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/find.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/findall.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/padding.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/regex_flags.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/regex_program.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/repeat.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/replace.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/replace_re.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/side_type.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/slice.pyi create mode 100644 
python/pylibcudf/pylibcudf/strings/split/partition.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/strip.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/translate.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/wrap.pyi create mode 100644 python/pylibcudf/pylibcudf/table.pyi create mode 100644 python/pylibcudf/pylibcudf/traits.pyi create mode 100644 python/pylibcudf/pylibcudf/transform.pyi create mode 100644 python/pylibcudf/pylibcudf/transpose.pyi create mode 100644 python/pylibcudf/pylibcudf/types.pyi create mode 100644 python/pylibcudf/pylibcudf/unary.pyi diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 0d463b918d3..fbb9ca4b128 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -26,16 +26,18 @@ import tempfile import warnings import xml.etree.ElementTree as ET +from enum import IntEnum +from typing import Any +import cudf from docutils.nodes import Text from packaging.version import Version -from sphinx.addnodes import pending_xref -from sphinx.highlighting import lexers -from sphinx.ext import intersphinx from pygments.lexer import RegexLexer from pygments.token import Text as PText - -import cudf +from sphinx.addnodes import pending_xref +from sphinx.ext import intersphinx +from sphinx.ext.autodoc import ClassDocumenter, bool_option +from sphinx.highlighting import lexers class PseudoLexer(RegexLexer): @@ -342,7 +344,10 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), + "DeviceBuffer": ( + "rmm.pylibrmm.device_buffer.DeviceBuffer", + "rmm.DeviceBuffer", + ), } @@ -373,7 +378,14 @@ def _generate_namespaces(namespaces): _all_namespaces = _generate_namespaces( { # Note that 
io::datasource is actually a nested class - "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"}, + "cudf": { + "io", + "io::datasource", + "strings", + "ast", + "ast::expression", + "io::text", + }, "numeric": {}, "nvtext": {}, } @@ -642,9 +654,54 @@ def linkcode_resolve(domain, info) -> str | None: f"branch-{version}/python/cudf/cudf/{fn}{linespec}" ) + # Needed for avoid build warning for PandasCompat extension suppress_warnings = ["myst.domains"] + +class PLCIntEnumDocumenter(ClassDocumenter): + objtype = "enum" + directivetype = "attribute" + priority = 10 + ClassDocumenter.priority + + option_spec = dict(ClassDocumenter.option_spec) + + @classmethod + def can_document_member( + cls, member: Any, membername: str, isattr: bool, parent: Any + ) -> bool: + try: + return issubclass( + member, IntEnum + ) and member.__module__.startswith("pylibcudf") + except TypeError: + return False + + def add_directive_header(self, sig: str) -> None: + self.directivetype = "attribute" + super().add_directive_header(sig) + + def add_content(self, more_content) -> None: + doc_as_attr = self.doc_as_attr + self.doc_as_attr = False + super().add_content(more_content) + self.doc_as_attr = doc_as_attr + source_name = self.get_sourcename() + enum_object: IntEnum = self.object + + if self.object.__name__ != "Kind": + self.add_line(f"See also :cpp:enum:`cudf::{self.object.__name__}`.", source_name) + self.add_line("", source_name) + self.add_line("Enum members", source_name) + self.add_line("", source_name) + + for the_member_name in enum_object.__members__: # type: ignore[attr-defined] + self.add_line( + f"* ``{the_member_name}``", source_name + ) + self.add_line("", source_name) + + def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") app.add_js_file( @@ -652,3 +709,5 @@ def setup(app): ) app.connect("doctree-read", resolve_aliases) app.connect("missing-reference", on_missing_reference) + app.setup_extension("sphinx.ext.autodoc") 
+ app.add_autodocumenter(PLCIntEnumDocumenter) diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 39840e72e21..1ee828e7c4e 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -15,7 +15,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design princip - All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing. - All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards. - Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies. - +- Type stubs are provided and generated manually. When adding new + functionality, ensure that the matching type stub is appropriately updated. ## Relationship to libcudf @@ -249,3 +250,73 @@ In the event that libcudf provides multiple overloads for the same function with and set arguments not shared between overloads to `None`. If a user tries to pass in an unsupported argument for a specific overload type, you should raise `ValueError`. Finally, consider making an libcudf issue if you think this inconsistency can be addressed on the libcudf side. + +### Type stubs + +Since static type checkers like `mypy` and `pyright` cannot parse +Cython code, we provide type stubs for the pylibcudf package. These +are currently maintained manually, alongside the matching pylibcudf +files. + +Every `pyx` file should have a matching `pyi` file that provides the +type stubs. Most functions can be exposed straightforwardly. Some +guiding principles: + +- For typed integer arguments in libcudf, use `int` as a type + annotation. 
+- For functions which are annotated as a `list` in Cython, but the + function body does more detailed checking, try to encode the + detailed information in the type. +- For Cython fused types there are two options: + 1. If the fused type appears only once in the function signature, + use a `Union` type; + 2. If the fused type appears more than once (or as both an input + and output type), use a `TypeVar` with + the variants in the fused type provided as constraints. + + +As an example, `pylibcudf.copying.split` is typed in Cython as: + +```cython +ctypedef fused ColumnOrTable: + Table + Column + +cpdef list split(ColumnOrTable input, list splits): ... +``` + +Here the fused type appears only once in the signature, and the bare `list` +types do not specify their element types. However, the output depends on the +input: if we provide a `Column` we receive a `list[Column]`, and if we +provide a `Table` we receive a `list[Table]`. + +In the type stub, we can encode this with a `TypeVar`. We can also +provide typing for the `splits` argument that indicates that the split +values must be integers: + +```python +ColumnOrTable = TypeVar("ColumnOrTable", Column, Table) + +def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ... +``` + +Conversely, `pylibcudf.copying.scatter` uses a fused type only once in +its input: + +```cython +ctypedef fused TableOrListOfScalars: + Table + list + +cpdef Table scatter( + TableOrListOfScalars source, Column scatter_map, Table target +) +``` + +In the type stub, we can use a normal union in this case: + +```python +def scatter( + source: Table | list[Scalar], scatter_map: Column, target: Table +) -> Table: ...
+``` diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx index 3966cce8981..524bfd3b2e8 100644 --- a/python/cudf/cudf/_lib/labeling.pyx +++ b/python/cudf/cudf/_lib/labeling.pyx @@ -17,8 +17,8 @@ def label_bins(Column input, Column left_edges, cbool left_inclusive, plc_column = plc.labeling.label_bins( input.to_pylibcudf(mode="read"), left_edges.to_pylibcudf(mode="read"), - left_inclusive, + plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO, right_edges.to_pylibcudf(mode="read"), - right_inclusive + plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO, ) return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index a91d44274e5..9a2aa4a6130 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -4,7 +4,9 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, null_equality, null_order, order, size_type +) from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -37,8 +39,8 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( plc.lists.distinct( col.to_pylibcudf(mode="read"), - nulls_equal, - nans_all_equal, + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL, + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL, ) ) @@ -48,12 +50,8 @@ def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( plc.lists.sort_lists( col.to_pylibcudf(mode="read"), - ascending, - ( - plc.types.NullOrder.BEFORE - if na_position == "first" - else plc.types.NullOrder.AFTER - ), + order.ASCENDING if ascending else order.DESCENDING, + null_order.BEFORE if na_position == "first" else null_order.AFTER, False, ) ) @@ -95,7 +93,7 @@ def 
index_of_scalar(Column col, object py_search_key): plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, - True, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -106,7 +104,7 @@ def index_of_column(Column col, Column search_keys): plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), - True, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -127,7 +125,9 @@ def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), - dropna, + plc.lists.ConcatenateNullPolicy.IGNORE + if dropna + else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, ) ) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 08bc9d0ea3f..7560a0f5a64 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -60,7 +60,7 @@ def to_polars(self) -> pl.DataFrame: # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. 
name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} - table: pa.Table = plc.interop.to_arrow( + table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index 65fa4bfa62f..cd8e5c6a4eb 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -27,7 +27,9 @@ class TemporalFunction(Expr): __slots__ = ("name", "options") - _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + _COMPONENT_MAP: ClassVar[ + dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent] + ] = { pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py index c16313bf83c..7eba0c110ab 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -58,7 +58,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: class LiteralColumn(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") - value: pa.Array[Any, Any] + value: pa.Array[Any] def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: self.dtype = dtype diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index beea5908e56..1f935190f28 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -517,7 +517,7 @@ def do_evaluate( # Mask must have been applied. 
return df elif typ == "ndjson": - json_schema: list[tuple[str, str, list]] = [ + json_schema: list[plc.io.json.NameAndType] = [ (name, typ, []) for name, typ in schema.items() ] plc_tbl_w_meta = plc.io.json.read_json( diff --git a/python/pylibcudf/pylibcudf/aggregation.pyi b/python/pylibcudf/pylibcudf/aggregation.pyi new file mode 100644 index 00000000000..a59e2a9dc93 --- /dev/null +++ b/python/pylibcudf/pylibcudf/aggregation.pyi @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.types import ( + DataType, + Interpolation, + NanEquality, + NullEquality, + NullOrder, + NullPolicy, + Order, +) + +class Kind(IntEnum): + SUM = ... + PRODUCT = ... + MIN = ... + MAX = ... + COUNT_VALID = ... + COUNT_ALL = ... + ANY = ... + ALL = ... + SUM_OF_SQUARES = ... + MEAN = ... + VARIANCE = ... + STD = ... + MEDIAN = ... + QUANTILE = ... + ARGMAX = ... + ARGMIN = ... + NUNIQUE = ... + NTH_ELEMENT = ... + RANK = ... + COLLECT_LIST = ... + COLLECT_SET = ... + PTX = ... + CUDA = ... + CORRELATION = ... + COVARIANCE = ... + +class CorrelationType(IntEnum): + PEARSON = ... + KENDALL = ... + SPEARMAN = ... + +class EWMHistory(IntEnum): + INFINITE = ... + FINITE = ... + +class RankMethod(IntEnum): + FIRST = ... + AVERAGE = ... + MIN = ... + MAX = ... + DENSE = ... + +class RankPercentage(IntEnum): + NONE = ... + ZERO_NORMALIZED = ... + ONE_NORMALIZED = ... + +class UdfType(IntEnum): + CUDA = ... + PTX = ... + +class Aggregation: + def __init__(self): ... + def kind(self) -> Kind: ... + +def sum() -> Aggregation: ... +def product() -> Aggregation: ... +def min() -> Aggregation: ... +def max() -> Aggregation: ... +def count(null_handling: NullPolicy = NullPolicy.INCLUDE) -> Aggregation: ... +def any() -> Aggregation: ... +def all() -> Aggregation: ... +def sum_of_squares() -> Aggregation: ... +def mean() -> Aggregation: ... +def variance(ddof: int = 1) -> Aggregation: ... +def std(ddof: int = 1) -> Aggregation: ... 
+def median() -> Aggregation: ... +def quantile( + quantiles: list[float], interp: Interpolation = Interpolation.LINEAR +) -> Aggregation: ... +def argmax() -> Aggregation: ... +def argmin() -> Aggregation: ... +def ewma(center_of_mass: float, history: EWMHistory) -> Aggregation: ... +def nunique(null_handling: NullPolicy = NullPolicy.EXCLUDE) -> Aggregation: ... +def nth_element( + n: int, null_handling: NullPolicy = NullPolicy.INCLUDE +) -> Aggregation: ... +def collect_list( + null_handling: NullPolicy = NullPolicy.INCLUDE, +) -> Aggregation: ... +def collect_set( + null_handling: NullPolicy = NullPolicy.INCLUDE, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Aggregation: ... +def udf(operation: str, output_type: DataType) -> Aggregation: ... +def correlation(type: CorrelationType, min_periods: int) -> Aggregation: ... +def covariance(min_periods: int, ddof: int) -> Aggregation: ... +def rank( + method: RankMethod, + column_order: Order = Order.ASCENDING, + null_handling: NullPolicy = NullPolicy.EXCLUDE, + null_precedence: NullOrder = NullOrder.AFTER, + percentage: RankPercentage = RankPercentage.NONE, +) -> Aggregation: ... 
diff --git a/python/pylibcudf/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx index e510b738f70..662f76d5c8e 100644 --- a/python/pylibcudf/pylibcudf/aggregation.pyx +++ b/python/pylibcudf/pylibcudf/aggregation.pyx @@ -64,6 +64,40 @@ from pylibcudf.libcudf.aggregation import udf_type as UdfType # no-cython-lint from .types cimport DataType +__all__ = [ + "Aggregation", + "CorrelationType", + "EWMHistory", + "Kind", + "RankMethod", + "RankPercentage", + "UdfType", + "all", + "any", + "argmax", + "argmin", + "collect_list", + "collect_set", + "correlation", + "count", + "covariance", + "ewma", + "max", + "mean", + "median", + "min", + "nth_element", + "nunique", + "product", + "quantile", + "rank", + "std", + "sum", + "sum_of_squares", + "udf", + "variance", +] + cdef class Aggregation: """A type of aggregation to perform. diff --git a/python/pylibcudf/pylibcudf/binaryop.pyi b/python/pylibcudf/pylibcudf/binaryop.pyi new file mode 100644 index 00000000000..f745e6c6854 --- /dev/null +++ b/python/pylibcudf/pylibcudf/binaryop.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class BinaryOperator(IntEnum): + ADD = ... + SUB = ... + MUL = ... + DIV = ... + TRUE_DIV = ... + FLOOR_DIV = ... + MOD = ... + PMOD = ... + PYMOD = ... + POW = ... + INT_POW = ... + LOG_BASE = ... + ATAN2 = ... + SHIFT_LEFT = ... + SHIFT_RIGHT = ... + SHIFT_RIGHT_UNSIGNED = ... + BITWISE_AND = ... + BITWISE_OR = ... + BITWISE_XOR = ... + LOGICAL_AND = ... + LOGICAL_OR = ... + EQUAL = ... + NOT_EQUAL = ... + LESS = ... + GREATER = ... + LESS_EQUAL = ... + GREATER_EQUAL = ... + NULL_EQUALS = ... + NULL_MAX = ... + NULL_MIN = ... + NULL_NOT_EQUALS = ... + GENERIC_BINARY = ... + NULL_LOGICAL_AND = ... + NULL_LOGICAL_OR = ... + INVALID_BINARY = ... 
+ +def binary_operation( + lhs: Column | Scalar, + rhs: Column | Scalar, + op: BinaryOperator, + output_type: DataType, +) -> Column: ... +def is_supported_operation( + out: DataType, lhs: DataType, rhs: DataType, op: BinaryOperator +) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index eef73bf4e9d..b7b4ecc6e83 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -16,6 +16,7 @@ from .column cimport Column from .scalar cimport Scalar from .types cimport DataType +__all__ = ["BinaryOperator", "binary_operation", "is_supported_operation"] cpdef Column binary_operation( LeftBinaryOperand lhs, diff --git a/python/pylibcudf/pylibcudf/column.pyi b/python/pylibcudf/pylibcudf/column.pyi new file mode 100644 index 00000000000..c9f70de3dbf --- /dev/null +++ b/python/pylibcudf/pylibcudf/column.pyi @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Sequence +from typing import Any + +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class Column: + def __init__( + self, + data_type: DataType, + size: int, + data: gpumemoryview | None, + mask: gpumemoryview | None, + null_count: int, + offset: int, + children: list[Column], + ) -> None: ... + def type(self) -> DataType: ... + def child(self, index: int) -> Column: ... + def size(self) -> int: ... + def null_count(self) -> int: ... + def offset(self) -> int: ... + def data(self) -> gpumemoryview | None: ... + def null_mask(self) -> gpumemoryview | None: ... + def children(self) -> list[Column]: ... + def copy(self) -> Column: ... + def with_mask( + self, mask: gpumemoryview | None, null_count: int + ) -> Column: ... + def list_view(self) -> ListColumnView: ... + @staticmethod + def from_scalar(scalar: Scalar, size: int) -> Column: ... 
+ @staticmethod + def all_null_like(like: Column, size: int) -> Column: ... + @staticmethod + def from_cuda_array_interface_obj(obj: Any) -> Column: ... + +class ListColumnView: + def __init__(self, column: Column): ... + def child(self) -> Column: ... + def offsets(self) -> Column: ... + +def is_c_contiguous( + shape: Sequence[int], strides: Sequence[int], itemsize: int +) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index 4e5698566d0..9bb5574608e 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -17,6 +17,7 @@ from .utils cimport int_to_bitmask_ptr, int_to_void_ptr import functools +__all__ = ["Column", "ListColumnView", "is_c_contiguous"] cdef class Column: """A container of nullable device data as a column of elements. @@ -61,6 +62,8 @@ cdef class Column: self._children = children self._num_children = len(children) + __hash__ = None + cdef column_view view(self) nogil: """Generate a libcudf column_view to pass to libcudf algorithms. @@ -384,6 +387,8 @@ cdef class ListColumnView: raise TypeError("Column is not a list type") self._column = col + __hash__ = None + cpdef child(self): """The data column of the underlying list column.""" return self._column.child(1) diff --git a/python/pylibcudf/pylibcudf/column_factories.pyi b/python/pylibcudf/pylibcudf/column_factories.pyi new file mode 100644 index 00000000000..c87fe423acb --- /dev/null +++ b/python/pylibcudf/pylibcudf/column_factories.pyi @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column import Column +from pylibcudf.types import DataType, MaskState, TypeId + +def make_empty_column(type_or_id: DataType | TypeId) -> Column: ... +def make_numeric_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_fixed_point_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... 
+def make_timestamp_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_duration_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_fixed_width_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index ac942a620b5..c4969a7f502 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -17,6 +17,15 @@ from .types cimport DataType, type_id from .types import MaskState, TypeId +__all__ = [ + "make_duration_column", + "make_empty_column", + "make_fixed_point_column", + "make_fixed_width_column", + "make_numeric_column", + "make_timestamp_column", +] + cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): """Creates an empty column of the specified type. diff --git a/python/pylibcudf/pylibcudf/concatenate.pyi b/python/pylibcudf/pylibcudf/concatenate.pyi new file mode 100644 index 00000000000..79076f509e0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/concatenate.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def concatenate[ColumnOrTable: (Column, Table)]( + objects: list[ColumnOrTable], +) -> ColumnOrTable: ... diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx index 10c860d97bb..42c5f34cf3e 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from .column cimport Column from .table cimport Table +__all__ = ["concatenate"] cpdef concatenate(list objects): """Concatenate columns or tables. 
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyi b/python/pylibcudf/pylibcudf/contiguous_split.pyi new file mode 100644 index 00000000000..dd6328fbf23 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.table import Table + +class PackedColumns: + def __init__(self): ... + def release(self) -> tuple[memoryview, gpumemoryview]: ... + +def pack(input: Table) -> PackedColumns: ... +def unpack(input: PackedColumns) -> Table: ... +def unpack_from_memoryviews( + metadata: memoryview, gpu_data: gpumemoryview +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index ed926a3fcc0..94873e079c9 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -20,6 +20,13 @@ from .table cimport Table from .utils cimport int_to_void_ptr +__all__ = [ + "PackedColumns", + "pack", + "unpack", + "unpack_from_memoryviews", +] + cdef class HostBuffer: """Owning host buffer that implements the buffer protocol""" cdef unique_ptr[vector[uint8_t]] c_obj @@ -38,6 +45,8 @@ cdef class HostBuffer: out.strides[0] = 1 return out + __hash__ = None + def __getbuffer__(self, Py_buffer *buffer, int flags): buffer.buf = dereference(self.c_obj).data() buffer.format = NULL # byte @@ -69,6 +78,8 @@ cdef class PackedColumns: "Use one of the factories." ) + __hash__ = None + @staticmethod cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data): """Create a Python PackedColumns from a libcudf packed_columns.""" diff --git a/python/pylibcudf/pylibcudf/copying.pyi b/python/pylibcudf/pylibcudf/copying.pyi new file mode 100644 index 00000000000..6cf4ed48724 --- /dev/null +++ b/python/pylibcudf/pylibcudf/copying.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from enum import IntEnum +from typing import TypeVar + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +class MaskAllocationPolicy(IntEnum): + NEVER = ... + RETAIN = ... + ALWAYS = ... + +class OutOfBoundsPolicy(IntEnum): + NULLIFY = ... + DONT_CHECK = ... + +ColumnOrTable = TypeVar("ColumnOrTable", Column, Table) + +def gather( + source_table: Table, gather_map: Column, bounds_policy: OutOfBoundsPolicy +) -> Table: ... +def scatter( + source: Table | list[Scalar], scatter_map: Column, target_table: Table +) -> Table: ... +def empty_like(input: ColumnOrTable) -> ColumnOrTable: ... +def allocate_like( + input_column: Column, policy: MaskAllocationPolicy, size: int | None = None +) -> Column: ... +def copy_range_in_place( + input_column: Column, + target_column: Column, + input_begin: int, + input_end: int, + target_begin: int, +) -> Column: ... +def copy_range( + input_column: Column, + target_column: Column, + input_begin: int, + input_end: int, + target_begin: int, +) -> Column: ... +def shift(input: Column, offset: int, fill_value: Scalar) -> Column: ... +def slice(input: ColumnOrTable, indices: list[int]) -> list[ColumnOrTable]: ... +def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ... +def copy_if_else( + lhs: Column | Scalar, rhs: Column | Scalar, boolean_mask: Column +) -> Column: ... +def boolean_mask_scatter( + input: Table | list[Scalar], target: Table, boolean_mask: Column +) -> Table: ... +def get_element(input_column: Column, index: int) -> Scalar: ... 
diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index 4938f1a3dda..fb8b6f9890e 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -36,6 +36,23 @@ from .table cimport Table from .utils cimport _as_vector +__all__ = [ + "MaskAllocationPolicy", + "OutOfBoundsPolicy", + "allocate_like", + "boolean_mask_scatter", + "copy_if_else", + "copy_range", + "copy_range_in_place", + "empty_like", + "gather", + "get_element", + "scatter", + "shift", + "slice", + "split", +] + cpdef Table gather( Table source_table, Column gather_map, diff --git a/python/pylibcudf/pylibcudf/datetime.pyi b/python/pylibcudf/pylibcudf/datetime.pyi new file mode 100644 index 00000000000..6a3ae7953d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/datetime.pyi @@ -0,0 +1,45 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class DatetimeComponent(IntEnum): + YEAR = ... + MONTH = ... + DAY = ... + WEEKDAY = ... + HOUR = ... + MINUTE = ... + SECOND = ... + MILLISECOND = ... + MICROSECOND = ... + NANOSECOND = ... + +class RoundingFrequency(IntEnum): + DAY = ... + HOUR = ... + MINUTE = ... + SECOND = ... + MILLISECOND = ... + MICROSECOND = ... + NANOSECOND = ... + +def extract_millisecond_fraction(input: Column) -> Column: ... +def extract_microsecond_fraction(input: Column) -> Column: ... +def extract_nanosecond_fraction(input: Column) -> Column: ... +def extract_datetime_component( + input: Column, component: DatetimeComponent +) -> Column: ... +def ceil_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def floor_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def round_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def add_calendrical_months( + input: Column, months: Column | Scalar +) -> Column: ... +def day_of_year(input: Column) -> Column: ... 
+def is_leap_year(input: Column) -> Column: ... +def last_day_of_month(input: Column) -> Column: ... +def extract_quarter(input: Column) -> Column: ... +def days_in_month(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 9e5e709d81d..b100e3e22d0 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -29,6 +29,24 @@ from cython.operator cimport dereference from .column cimport Column +__all__ = [ + "DatetimeComponent", + "RoundingFrequency", + "add_calendrical_months", + "ceil_datetimes", + "day_of_year", + "days_in_month", + "extract_datetime_component", + "extract_microsecond_fraction", + "extract_millisecond_fraction", + "extract_nanosecond_fraction", + "extract_quarter", + "floor_datetimes", + "is_leap_year", + "last_day_of_month", + "round_datetimes", +] + cpdef Column extract_millisecond_fraction( Column input ): diff --git a/python/pylibcudf/pylibcudf/experimental.pyi b/python/pylibcudf/pylibcudf/experimental.pyi new file mode 100644 index 00000000000..bbfb86b0ff6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/experimental.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +def enable_prefetching(key: str) -> None: ... +def disable_prefetching(key: str) -> None: ... +def prefetch_debugging(enable: bool) -> None: ... diff --git a/python/pylibcudf/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx index b25a53e13b2..d94d6d087ac 100644 --- a/python/pylibcudf/pylibcudf/experimental.pyx +++ b/python/pylibcudf/pylibcudf/experimental.pyx @@ -5,6 +5,8 @@ from libcpp.string cimport string from pylibcudf.libcudf cimport experimental as cpp_experimental +__all__ = ["disable_prefetching", "enable_prefetching", "prefetch_debugging"] + cpdef enable_prefetching(str key): """Turn on prefetch instructions for the given key. 
diff --git a/python/pylibcudf/pylibcudf/expressions.pyi b/python/pylibcudf/pylibcudf/expressions.pyi new file mode 100644 index 00000000000..12b473d8605 --- /dev/null +++ b/python/pylibcudf/pylibcudf/expressions.pyi @@ -0,0 +1,79 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +from pylibcudf.scalar import Scalar + +class TableReference(IntEnum): + LEFT = ... + RIGHT = ... + +class ASTOperator(IntEnum): + ADD = ... + SUB = ... + MUL = ... + DIV = ... + TRUE_DIV = ... + FLOOR_DIV = ... + MOD = ... + PYMOD = ... + POW = ... + EQUAL = ... + NULL_EQUAL = ... + NOT_EQUAL = ... + LESS = ... + GREATER = ... + LESS_EQUAL = ... + GREATER_EQUAL = ... + BITWISE_AND = ... + BITWISE_OR = ... + BITWISE_XOR = ... + NULL_LOGICAL_AND = ... + LOGICAL_AND = ... + NULL_LOGICAL_OR = ... + LOGICAL_OR = ... + IDENTITY = ... + IS_NULL = ... + SIN = ... + COS = ... + TAN = ... + ARCSIN = ... + ARCCOS = ... + ARCTAN = ... + SINH = ... + COSH = ... + TANH = ... + ARCSINH = ... + ARCCOSH = ... + ARCTANH = ... + EXP = ... + LOG = ... + SQRT = ... + CBRT = ... + CEIL = ... + FLOOR = ... + ABS = ... + RINT = ... + BIT_INVERT = ... + NOT = ... + +class Expression: + def __init__(self): ... + +class Literal(Expression): + def __init__(self, value: Scalar): ... + +class ColumnReference(Expression): + def __init__( + self, index: int, table_source: TableReference = TableReference.LEFT + ): ... + +class ColumnNameReference(Expression): + def __init__(self, name: str): ... + +class Operation(Expression): + def __init__( + self, + op: ASTOperator, + left: Expression, + right: Expression | None = None, + ): ... 
diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx index 1535f68366b..0f12cfe313c 100644 --- a/python/pylibcudf/pylibcudf/expressions.pyx +++ b/python/pylibcudf/pylibcudf/expressions.pyx @@ -49,6 +49,16 @@ from .types cimport DataType # Aliases for simplicity ctypedef unique_ptr[libcudf_exp.expression] expression_ptr +__all__ = [ + "ASTOperator", + "ColumnNameReference", + "ColumnReference", + "Expression", + "Literal", + "Operation", + "TableReference", +] + # Define this class just to have a docstring for it cdef class Expression: """ @@ -58,7 +68,7 @@ cdef class Expression: For details, see :cpp:class:`cudf::ast::expression`. """ - pass + __hash__ = None cdef class Literal(Expression): """ diff --git a/python/pylibcudf/pylibcudf/filling.pyi b/python/pylibcudf/pylibcudf/filling.pyi new file mode 100644 index 00000000000..0b5e29bdc32 --- /dev/null +++ b/python/pylibcudf/pylibcudf/filling.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +def fill( + destination: Column, begin: int, end: int, value: Scalar +) -> Column: ... +def fill_in_place( + destination: Column, begin: int, end: int, value: Scalar +) -> None: ... +def sequence(size: int, init: Scalar, step: Scalar) -> Column: ... +def repeat(input_table: Table, count: Column | int) -> Table: ... +def calendrical_month_sequence( + n: int, init: Scalar, months: int +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index 313605ead16..ea5b45ff7c2 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -19,6 +19,14 @@ from .scalar cimport Scalar from .table cimport Table +__all__ = [ + "fill", + "fill_in_place", + "repeat", + "sequence", + "calendrical_month_sequence", +] + cpdef Column fill( Column destination, size_type begin, diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyi b/python/pylibcudf/pylibcudf/gpumemoryview.pyi new file mode 100644 index 00000000000..50f1f39a515 --- /dev/null +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Mapping +from typing import Any + +class gpumemoryview: + def __init__(self, data: Any): ... + @property + def __cuda_array_interface__(self) -> Mapping[str, Any]: ... diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx index 0904022a944..41316eddb60 100644 --- a/python/pylibcudf/pylibcudf/gpumemoryview.pyx +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +__all__ = ["gpumemoryview"] cdef class gpumemoryview: """Minimal representation of a memory buffer. @@ -25,3 +26,5 @@ cdef class gpumemoryview: @property def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ + + __hash__ = None diff --git a/python/pylibcudf/pylibcudf/groupby.pyi b/python/pylibcudf/pylibcudf/groupby.pyi new file mode 100644 index 00000000000..883ad6e34cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/groupby.pyi @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column +from pylibcudf.replace import ReplacePolicy +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, NullPolicy, Order, Sorted + +class GroupByRequest: + def __init__( + self, values: Column, aggregations: list[Aggregation] + ) -> None: ... + +class GroupBy: + def __init__( + self, + keys: Table, + null_handling: NullPolicy = NullPolicy.EXCLUDE, + keys_are_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, + ) -> None: ... + def aggregate( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def scan( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def shift( + self, values: Table, offset: list[int], fill_values: list[Scalar] + ) -> tuple[Table, Table]: ... + def replace_nulls( + self, value: Table, replace_policies: list[ReplacePolicy] + ) -> tuple[Table, Table]: ... + def get_groups( + self, values: Table | None = None + ) -> tuple[list[int], Table, Table]: ... diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index 71f9ecb0453..e6cb3ac81a7 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -25,6 +25,8 @@ from .types cimport null_order, null_policy, order, sorted from .utils cimport _as_vector +__all__ = ["GroupBy", "GroupByRequest"] + cdef class GroupByRequest: """A request for a groupby aggregation or scan. @@ -45,6 +47,8 @@ cdef class GroupByRequest: self._values = values self._aggregations = aggregations + __hash__ = None + cdef aggregation_request _to_libcudf_agg_request(self) except *: """Convert to a libcudf aggregation_request object. 
@@ -127,6 +131,8 @@ cdef class GroupBy: # deallocated from under us: self._keys = keys + __hash__ = None + @staticmethod cdef tuple _parse_outputs( pair[unique_ptr[table], vector[aggregation_result]] c_res diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi new file mode 100644 index 00000000000..a849f5d0729 --- /dev/null +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Final + +from pylibcudf.column import Column +from pylibcudf.table import Table + +LIBCUDF_DEFAULT_HASH_SEED: Final[int] + +def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... +def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_64(input: Table, seed: int = ...) -> Column: ... +def md5(input: Table) -> Column: ... +def sha1(input: Table) -> Column: ... +def sha224(input: Table) -> Column: ... +def sha256(input: Table) -> Column: ... +def sha384(input: Table) -> Column: ... +def sha512(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 9ea3d4d1bda..548cffc0ce8 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -20,6 +20,19 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column from .table cimport Table +__all__ = [ + "LIBCUDF_DEFAULT_HASH_SEED", + "md5", + "murmurhash3_x64_128", + "murmurhash3_x86_32", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "xxhash_64", +] + LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED cpdef Column murmurhash3_x86_32( diff --git a/python/pylibcudf/pylibcudf/interop.pyi b/python/pylibcudf/pylibcudf/interop.pyi new file mode 100644 index 00000000000..63de816010b --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pyi @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from collections.abc import Iterable, Mapping +from dataclasses import dataclass +from typing import Any, overload + +import pyarrow as pa + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import DataType + +@dataclass +class ColumnMetadata: + name: str = ... + children_meta: list[ColumnMetadata] = ... + +@overload +def from_arrow(obj: pa.DataType) -> DataType: ... +@overload +def from_arrow( + obj: pa.Scalar[Any], *, data_type: DataType | None = None +) -> Scalar: ... +@overload +def from_arrow(obj: pa.Array[Any]) -> Column: ... +@overload +def from_arrow(obj: pa.Table) -> Table: ... +@overload +def to_arrow( + obj: DataType, + *, + precision: int | None = None, + fields: Iterable[pa.Field[pa.DataType] | tuple[str, pa.DataType]] + | Mapping[str, pa.DataType] + | None = None, + value_type: pa.DataType | None = None, +) -> pa.DataType: ... +@overload +def to_arrow( + obj: Table, metadata: list[ColumnMetadata | str] | None = None +) -> pa.Table: ... +@overload +def to_arrow( + obj: Column, metadata: ColumnMetadata | str | None = None +) -> pa.Array[Any]: ... +@overload +def to_arrow( + obj: Scalar, metadata: ColumnMetadata | str | None = None +) -> pa.Scalar[Any]: ... +def from_dlpack(managed_tensor: Any) -> Table: ... +def to_dlpack(input: Table) -> Any: ... 
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 61e812353b7..bd5397ac328 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -38,6 +38,14 @@ from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +__all__ = [ + "ColumnMetadata", + "from_arrow", + "from_dlpack", + "to_arrow", + "to_dlpack", +] + ARROW_TO_PYLIBCUDF_TYPES = { pa.int8(): type_id.INT8, pa.int16(): type_id.INT16, diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 9e8e0f6e080..f913a400684 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -13,3 +13,19 @@ types, ) from .types import SinkInfo, SourceInfo, TableWithMetadata + +__all__ = [ + "SinkInfo", + "SourceInfo", + "TableWithMetadata", + "avro", + "csv", + "datasource", + "json", + "orc", + "parquet", + "parquet_metadata", + "text", + "timezone", + "types", +] diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi new file mode 100644 index 00000000000..49c2f083702 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/avro.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +__all__ = ["read_avro"] + +def read_avro( + source_info: SourceInfo, + columns: list[str] | None = None, + skip_rows: int = 0, + num_rows: int = -1, +) -> TableWithMetadata: ... 
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index fe765b34f82..4271333511a 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.io.avro cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["read_avro"] + cpdef TableWithMetadata read_avro( SourceInfo source_info, diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi new file mode 100644 index 00000000000..356825a927d --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Mapping + +from pylibcudf.io.types import ( + CompressionType, + QuoteStyle, + SourceInfo, + TableWithMetadata, +) +from pylibcudf.types import DataType + +def read_csv( + source_info: SourceInfo, + *, + compression: CompressionType = CompressionType.AUTO, + byte_range_offset: int = 0, + byte_range_size: int = 0, + col_names: list[str] | None = None, + prefix: str = "", + mangle_dupe_cols: bool = True, + usecols: list[int] | list[str] | None = None, + nrows: int = -1, + skiprows: int = 0, + skipfooter: int = 0, + header: int = 0, + lineterminator: str = "\n", + delimiter: str | None = None, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + delim_whitespace: bool = False, + skipinitialspace: bool = False, + skip_blank_lines: bool = True, + quoting: QuoteStyle = QuoteStyle.MINIMAL, + quotechar: str = '"', + doublequote: bool = True, + parse_dates: list[str] | list[int] | None = None, + parse_hex: list[str] | list[int] | None = None, + # Technically this should be dict/list + # but using a fused type prevents using None as default + dtypes: Mapping[str, DataType] | list[DataType] | None = None, + true_values: list[str] | None = None, + false_values: list[str] | None = None, + na_values: list[str] | None = None, + keep_default_na: bool = 
True, + na_filter: bool = True, + dayfirst: bool = False, + # Note: These options are supported by the libcudf reader + # but are not exposed here since there is no demand for them + # on the Python side yet. + # detect_whitespace_around_quotes: bool = False, + # timestamp_type: DataType = DataType(type_id.EMPTY), +) -> TableWithMetadata: ... diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 2c61cc42d82..858e580ab34 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -19,6 +19,8 @@ from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType +__all__ = ["read_csv"] + cdef tuple _process_parse_dates_hex(list cols): cdef vector[string] str_cols cdef vector[int] int_cols diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyi b/python/pylibcudf/pylibcudf/io/datasource.pyi new file mode 100644 index 00000000000..e52197f793b --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/datasource.pyi @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +class Datasource: + def __init__(self): ... diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx index 02418444caa..aac1c0d1014 100644 --- a/python/pylibcudf/pylibcudf/io/datasource.pyx +++ b/python/pylibcudf/pylibcudf/io/datasource.pyx @@ -2,8 +2,10 @@ from pylibcudf.libcudf.io.datasource cimport datasource +__all__ = ["Datasource"] cdef class Datasource: + __hash__ = None cdef datasource* get_datasource(self) except * nogil: with gil: raise NotImplementedError("get_datasource() should not " diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi new file mode 100644 index 00000000000..b2bc6a43700 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from collections.abc import Mapping +from typing import TypeAlias + +from pylibcudf.column import Column +from pylibcudf.io.types import ( + CompressionType, + JSONRecoveryMode, + SinkInfo, + SourceInfo, + TableWithMetadata, +) +from pylibcudf.types import DataType + +ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap] + +NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]] + +def read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + lines: bool = False, + byte_range_offset: int = 0, + byte_range_size: int = 0, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, +) -> TableWithMetadata: ... +def write_json( + sink_info: SinkInfo, + table_w_meta: TableWithMetadata, + na_rep: str = "", + include_nulls: bool = False, + lines: bool = False, + rows_per_chunk: int = 2**32 - 1, + true_value: str = "true", + false_value: str = "false", +) -> None: ... +def chunked_read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, + chunk_size: int = 100_000_000, +) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ... 
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 65f78f830f1..ad2989925c9 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -23,6 +23,7 @@ from pylibcudf.libcudf.io.types cimport ( from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType +__all__ = ["chunked_read_json", "read_json", "write_json"] cdef map[string, schema_element] _generate_schema_map(list dtypes): cdef map[string, schema_element] schema_map diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi new file mode 100644 index 00000000000..4cf87f1a832 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Any + +from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.types import DataType + +def read_orc( + source_info: SourceInfo, + columns: list[str] | None = None, + stripes: list[list[int]] | None = None, + skip_rows: int = 0, + nrows: int = -1, + use_index: bool = True, + use_np_dtypes: bool = True, + timestamp_type: DataType | None = None, + decimal128_columns: list[str] | None = None, +) -> TableWithMetadata: ... + +class OrcColumnStatistics: + def __init__(self): ... + @property + def number_of_values(self) -> int | None: ... + @property + def has_null(self) -> bool | None: ... + def __getitem__(self, item: str) -> Any: ... + def __contains__(self, item: str) -> bool: ... + def get[T](self, item: str, default: None | T = None) -> T | None: ... + +class ParsedOrcStatistics: + def __init__(self): ... + @property + def column_names(self) -> list[str]: ... + @property + def file_stats(self) -> list[OrcColumnStatistics]: ... + @property + def stripes_stats(self) -> list[OrcColumnStatistics]: ... + +def read_parsed_orc_statistics( + source_info: SourceInfo, +) -> ParsedOrcStatistics: ... 
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 70e0a7995a2..4270f5b4f95 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -30,6 +30,12 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType from pylibcudf.variant cimport get_if, holds_alternative +__all__ = [ + "OrcColumnStatistics", + "ParsedOrcStatistics", + "read_orc", + "read_parsed_orc_statistics", +] cdef class OrcColumnStatistics: def __init__(self): @@ -39,6 +45,8 @@ cdef class OrcColumnStatistics: "use `OrcColumnStatistics.from_libcudf` instead." ) + __hash__ = None + @property def number_of_values(self): if self.number_of_values_c.has_value(): @@ -183,6 +191,8 @@ cdef class OrcColumnStatistics: cdef class ParsedOrcStatistics: + __hash__ = None + @property def column_names(self): return [name.decode() for name in self.c_obj.column_names] diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi new file mode 100644 index 00000000000..bcf1d1cce09 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.expressions import Expression +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +class ChunkedParquetReader: + def __init__( + self, + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + use_pandas_metadata: bool = True, + convert_strings_to_categories: bool = False, + skip_rows: int = 0, + nrows: int = 0, + chunk_read_limit: int = 0, + pass_read_limit: int = 1024000000, + allow_mismatched_pq_schemas: bool = False, + ) -> None: ... + def has_next(self) -> bool: ... + def read_chunk(self) -> TableWithMetadata: ... 
+ +def read_parquet( + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + filters: Expression | None = None, + convert_strings_to_categories: bool = False, + use_pandas_metadata: bool = True, + skip_rows: int = 0, + nrows: int = -1, + allow_mismatched_pq_schemas: bool = False, + # disabled see comment in parquet.pyx for more + # reader_column_schema: ReaderColumnSchema = *, + # timestamp_type: DataType = * +) -> TableWithMetadata: ... diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index 981ca7b8159..b76a352d633 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -16,6 +16,8 @@ from pylibcudf.libcudf.io.parquet cimport ( from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type +__all__ = ["ChunkedParquetReader", "read_parquet"] + cdef parquet_reader_options _setup_parquet_reader_options( SourceInfo source_info, @@ -123,6 +125,8 @@ cdef class ChunkedParquetReader: ) ) + __hash__ = None + cpdef bool has_next(self): """ Returns True if there is another chunk in the Parquet file diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx index 352905ff0f8..0ad4dafb0cf 100644 --- a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx @@ -4,6 +4,13 @@ from pylibcudf.io.types cimport SourceInfo from pylibcudf.libcudf.io cimport parquet_metadata as cpp_parquet_metadata +__all__ = [ + "ParquetColumnSchema", + "ParquetMetadata", + "ParquetSchema", + "read_parquet_metadata", +] + cdef class ParquetColumnSchema: """ Schema of a parquet column, including the nested columns. @@ -164,7 +171,7 @@ cdef class ParquetMetadata: Returns ------- - dict[bytes, bytes] + dict[str, str] Key value metadata as a map. 
""" return {key.decode(): val.decode() for key, val in self.meta.metadata()} diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx index 667a054baaa..d3cbdc4cd60 100644 --- a/python/pylibcudf/pylibcudf/io/text.pyx +++ b/python/pylibcudf/pylibcudf/io/text.pyx @@ -10,6 +10,15 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.io cimport text as cpp_text +__all__ = [ + "DataChunkSource", + "ParseOptions", + "make_source", + "make_source_from_bgzip_file", + "make_source_from_file", + "multibyte_split", +] + cdef class ParseOptions: """ Parsing options for `multibyte_split` diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyi b/python/pylibcudf/pylibcudf/io/timezone.pyi new file mode 100644 index 00000000000..0582800c4af --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/timezone.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table import Table + +def make_timezone_transition_table( + tzif_dir: str, timezone_name: str +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index f120b65fb2c..af7cf8a4ee5 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from ..table cimport Table +__all__ = ["make_timezone_transition_table"] cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): """ diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi new file mode 100644 index 00000000000..a4f4fc13bdc --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import io +import os +from collections.abc import Mapping +from enum import IntEnum +from typing import Any, Literal, TypeAlias, overload + +from pylibcudf.column import Column +from pylibcudf.io.datasource import Datasource +from pylibcudf.table import Table + +class JSONRecoveryMode(IntEnum): + FAIL = ... + RECOVER_WITH_NULL = ... + +class CompressionType(IntEnum): + NONE = ... + AUTO = ... + SNAPPY = ... + GZIP = ... + BZIP2 = ... + BROTLI = ... + ZIP = ... + XZ = ... + ZLIB = ... + LZ4 = ... + LZO = ... + ZSTD = ... + +class ColumnEncoding(IntEnum): + USE_DEFAULT = ... + DICTIONARY = ... + PLAIN = ... + DELTA_BINARY_PACKED = ... + DELTA_LENGTH_BYTE_ARRAY = ... + DELTA_BYTE_ARRAY = ... + BYTE_STREAM_SPLIT = ... + DIRECT = ... + DIRECT_V2 = ... + DICTIONARY_V2 = ... + +class DictionaryPolicy(IntEnum): + NEVER = ... + ADAPTIVE = ... + ALWAYS = ... + +class StatisticsFreq(IntEnum): + STATISTICS_NONE = ... + STATISTICS_ROWGROUP = ... + STATISTICS_PAGE = ... + STATISTICS_COLUMN = ... + +class QuoteStyle(IntEnum): + MINIMAL = ... + ALL = ... + NONNUMERIC = ... + NONE = ... + +ColumnNameSpec: TypeAlias = tuple[str, list[ColumnNameSpec]] +ChildNameSpec: TypeAlias = Mapping[str, ChildNameSpec] + +class TableWithMetadata: + tbl: Table + def __init__( + self, tbl: Table, column_names: list[ColumnNameSpec] + ) -> None: ... + @property + def columns(self) -> list[Column]: ... + @overload + def column_names(self, include_children: Literal[False]) -> list[str]: ... + @overload + def column_names( + self, include_children: Literal[True] + ) -> list[ColumnNameSpec]: ... + @overload + def column_names( + self, include_children: bool = False + ) -> list[str] | list[ColumnNameSpec]: ... + @property + def child_names(self) -> ChildNameSpec: ... + @property + def per_file_user_data(self) -> list[Mapping[str, str]]: ... + +class SourceInfo: + def __init__( + self, sources: list[str] | list[os.PathLike[Any]] | list[Datasource] + ) -> None: ... 
+ +class SinkInfo: + def __init__( + self, + sinks: list[os.PathLike[Any]] + | list[io.StringIO] + | list[io.BytesIO] + | list[io.TextIOBase] + | list[str], + ) -> None: ... diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index c129903f8f1..5db4eeb9583 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -28,9 +28,21 @@ from pylibcudf.libcudf.io.types import ( compression_type as CompressionType, # no-cython-lint column_encoding as ColumnEncoding, # no-cython-lint dictionary_policy as DictionaryPolicy, # no-cython-lint + quote_style as QuoteStyle, # no-cython-lint statistics_freq as StatisticsFreq, # no-cython-lint ) +__all__ = [ + "ColumnEncoding", + "CompressionType", + "DictionaryPolicy", + "JSONRecoveryMode", + "QuoteStyle", + "SinkInfo", + "SourceInfo", + "StatisticsFreq", + "TableWithMetadata", +] cdef class TableWithMetadata: """A container holding a table and its associated metadata @@ -54,6 +66,8 @@ cdef class TableWithMetadata: self.metadata.schema_info = self._make_column_info(column_names) + __hash__ = None + cdef vector[column_name_info] _make_column_info(self, list column_names): cdef vector[column_name_info] col_name_infos cdef column_name_info info @@ -219,6 +233,8 @@ cdef class SourceInfo: self.c_obj = source_info(c_host_buffers) + __hash__ = None + # Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you # write from cudf to any python file-like object (File/BytesIO/SocketIO etc) @@ -301,3 +317,5 @@ cdef class SinkInfo: else: # we don't have sinks so we must have paths to sinks self.c_obj = sink_info(paths) + + __hash__ = None diff --git a/python/pylibcudf/pylibcudf/join.pyi b/python/pylibcudf/pylibcudf/join.pyi new file mode 100644 index 00000000000..f34357baa67 --- /dev/null +++ b/python/pylibcudf/pylibcudf/join.pyi @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column +from pylibcudf.expressions import Expression +from pylibcudf.table import Table +from pylibcudf.types import NullEquality + +def inner_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> tuple[Column, Column]: ... +def left_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> tuple[Column, Column]: ... +def full_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> tuple[Column, Column]: ... +def left_semi_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> Column: ... +def left_anti_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> Column: ... +def cross_join(left: Table, right: Table) -> Table: ... +def conditional_inner_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_full_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_semi_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def conditional_left_anti_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def mixed_inner_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_left_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_full_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... 
+def mixed_left_semi_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... +def mixed_left_anti_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 0d841eee194..c2efe05ffc4 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -15,6 +15,24 @@ from .column cimport Column from .expressions cimport Expression from .table cimport Table +__all__ = [ + "conditional_full_join", + "conditional_inner_join", + "conditional_left_anti_join", + "conditional_left_join", + "conditional_left_semi_join", + "cross_join", + "full_join", + "inner_join", + "left_anti_join", + "left_join", + "left_semi_join", + "mixed_full_join", + "mixed_inner_join", + "mixed_left_anti_join", + "mixed_left_join", + "mixed_left_semi_join", +] cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column diff --git a/python/pylibcudf/pylibcudf/json.pyi b/python/pylibcudf/pylibcudf/json.pyi new file mode 100644 index 00000000000..b93d4876dab --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class GetJsonObjectOptions: + def __init__( + self, + *, + allow_single_quotes: bool = False, + strip_quotes_from_single_strings: bool = True, + missing_fields_as_nulls: bool = False, + ) -> None: ... + def get_allow_single_quotes(self) -> bool: ... + def get_strip_quotes_from_single_strings(self) -> bool: ... + def get_missing_fields_as_nulls(self) -> bool: ... + def set_allow_single_quotes(self, val: bool) -> None: ... 
+ def set_strip_quotes_from_single_strings(self, val: bool) -> None: ... + def set_missing_fields_as_nulls(self, val: bool) -> None: ... + +def get_json_object( + col: Column, json_path: Scalar, options: GetJsonObjectOptions | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx index ebb82f80408..5ec1e1be971 100644 --- a/python/pylibcudf/pylibcudf/json.pyx +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.scalar cimport Scalar +__all__ = ["GetJsonObjectOptions", "get_json_object"] cdef class GetJsonObjectOptions: """Settings for ``get_json_object()``""" @@ -26,6 +27,8 @@ cdef class GetJsonObjectOptions: ) self.set_missing_fields_as_nulls(missing_fields_as_nulls) + __hash__ = None + def get_allow_single_quotes(self): """ Returns true/false depending on whether single-quotes for representing strings diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd index 6f8797ae7d3..b1f9f2e806d 100644 --- a/python/pylibcudf/pylibcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/labeling.pxd @@ -8,7 +8,7 @@ from .column cimport Column cpdef Column label_bins( Column input, Column left_edges, - bool left_inclusive, + inclusive left_inclusive, Column right_edges, - bool right_inclusive + inclusive right_inclusive ) diff --git a/python/pylibcudf/pylibcudf/labeling.pyi b/python/pylibcudf/pylibcudf/labeling.pyi new file mode 100644 index 00000000000..c3a75d10baf --- /dev/null +++ b/python/pylibcudf/pylibcudf/labeling.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column + +class Inclusive(IntEnum): + YES = ... + NO = ... 
+ +def label_bins( + input: Column, + left_edges: Column, + left_inclusive: Inclusive, + right_edges: Column, + right_inclusive: Inclusive, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index 226a9e14172..cae1830f6b9 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -10,13 +10,14 @@ from pylibcudf.libcudf.labeling import inclusive as Inclusive # no-cython-lint from .column cimport Column +__all__ = ["Inclusive", "label_bins"] cpdef Column label_bins( Column input, Column left_edges, - bool left_inclusive, + inclusive left_inclusive, Column right_edges, - bool right_inclusive + inclusive right_inclusive ): """Labels elements based on membership in the specified bins. @@ -28,11 +29,11 @@ cpdef Column label_bins( Column of input elements to label according to the specified bins. left_edges : Column Column of the left edge of each bin. - left_inclusive : bool + left_inclusive : Inclusive Whether or not the left edge is inclusive. right_edges : Column Column of the right edge of each bin. - right_inclusive : bool + right_inclusive : Inclusive Whether or not the right edge is inclusive. Returns @@ -42,24 +43,13 @@ cpdef Column label_bins( according to the specified bins. 
""" cdef unique_ptr[column] c_result - cdef inclusive c_left_inclusive = ( - inclusive.YES - if left_inclusive - else inclusive.NO - ) - cdef inclusive c_right_inclusive = ( - inclusive.YES - if right_inclusive - else inclusive.NO - ) - with nogil: c_result = cpp_labeling.label_bins( input.view(), left_edges.view(), - c_left_inclusive, + left_inclusive, right_edges.view(), - c_right_inclusive, + right_inclusive, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 15beaee47d4..00669ff579a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -24,4 +24,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) add_subdirectory(io) +add_subdirectory(lists) add_subdirectory(strings) diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt new file mode 100644 index 00000000000..c896db2c85a --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources combine.pyx contains.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_lists +) diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd index d077958ce03..09a5d84c64f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,10 +10,9 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: - ctypedef enum concatenate_null_policy: - IGNORE "cudf::lists::concatenate_null_policy::IGNORE" - NULLIFY_OUTPUT_ROW \ - "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW" + cpdef enum class concatenate_null_policy(int32_t): + IGNORE + NULLIFY_OUTPUT_ROW cdef unique_ptr[column] concatenate_rows( const table_view input_table diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd index e7d006e6e2e..10c1c26e24e 100644 --- a/python/pylibcudf/pylibcudf/lists.pxd +++ b/python/pylibcudf/pylibcudf/lists.pxd @@ -1,7 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, null_equality, null_order, order, size_type +) +from pylibcudf.libcudf.lists.combine cimport concatenate_null_policy +from pylibcudf.libcudf.lists.contains cimport duplicate_find_option from .column cimport Column from .scalar cimport Scalar @@ -19,13 +23,13 @@ cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) -cpdef Column concatenate_list_elements(Column, bool dropna) +cpdef Column concatenate_list_elements(Column, concatenate_null_policy null_policy) cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) -cpdef Column index_of(Column, ColumnOrScalar, bool) +cpdef Column index_of(Column, ColumnOrScalar, duplicate_find_option) cpdef Column reverse(Column) @@ -37,16 +41,24 @@ cpdef Column count_elements(Column) cpdef Column sequences(Column, Column, Column steps = *) -cpdef Column sort_lists(Column, bool, null_order, bool stable = *) +cpdef Column sort_lists(Column, order, null_order, bool stable = *) -cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column difference_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column have_overlap( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column intersect_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column union_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) cpdef Column apply_boolean_mask(Column, Column) -cpdef Column distinct(Column, bool, bool) 
+cpdef Column distinct(Column, null_equality, nan_equality) diff --git a/python/pylibcudf/pylibcudf/lists.pyi b/python/pylibcudf/pylibcudf/lists.pyi new file mode 100644 index 00000000000..dff6c400638 --- /dev/null +++ b/python/pylibcudf/pylibcudf/lists.pyi @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NullEquality, NullOrder, Order + +class ConcatenateNullPolicy(IntEnum): + IGNORE = ... + NULLIFY_OUTPUT_ROW = ... + +class DuplicateFindOption(IntEnum): + FIND_FIRST = ... + FIND_LAST = ... + +def explode_outer(input: Table, explode_column_idx: int) -> Table: ... +def concatenate_rows(input: Table) -> Column: ... +def concatenate_list_elements( + input: Column, null_policy: ConcatenateNullPolicy +) -> Column: ... +def contains(input: Column, search_key: Column | Scalar) -> Column: ... +def contains_nulls(input: Column) -> Column: ... +def index_of( + input: Column, + search_key: Column | Scalar, + find_option: DuplicateFindOption, +) -> Column: ... +def reverse(input: Column) -> Column: ... +def segmented_gather(input: Column, gather_map_list: Column) -> Column: ... +def extract_list_element(input: Column, index: Column | int) -> Column: ... +def count_elements(input: Column) -> Column: ... +def sequences( + starts: Column, sizes: Column, steps: Column | None = None +) -> Column: ... +def sort_lists( + input: Column, + sort_order: Order, + na_position: NullOrder, + stable: bool = False, +) -> Column: ... +def difference_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def have_overlap( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... 
+def intersect_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def union_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def apply_boolean_mask(input: Column, boolean_mask: Column) -> Column: ... +def distinct( + input: Column, nulls_equal: NullEquality, nans_equal: NanEquality +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index ecaf62d6895..ccc56eaa520 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -42,10 +42,35 @@ from pylibcudf.libcudf.types cimport ( ) from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType +from pylibcudf.libcudf.lists.combine import concatenate_null_policy as ConcatenateNullPolicy # no-cython-lint +from pylibcudf.libcudf.lists.contains import duplicate_find_option as DuplicateFindOption # no-cython-lint + from .column cimport Column, ListColumnView from .scalar cimport Scalar from .table cimport Table +__all__ = [ + "ConcatenateNullPolicy", + "DuplicateFindOption", + "apply_boolean_mask", + "concatenate_list_elements", + "concatenate_rows", + "contains", + "contains_nulls", + "count_elements", + "difference_distinct", + "distinct", + "explode_outer", + "extract_list_element", + "have_overlap", + "index_of", + "intersect_distinct", + "reverse", + "segmented_gather", + "sequences", + "sort_lists", + "union_distinct", +] cpdef Table explode_outer(Table input, size_type explode_column_idx): """Explode a column of lists into rows. 
@@ -97,7 +122,9 @@ cpdef Column concatenate_rows(Table input): return Column.from_libcudf(move(c_result)) -cpdef Column concatenate_list_elements(Column input, bool dropna): +cpdef Column concatenate_list_elements( + Column input, concatenate_null_policy null_policy +): """Concatenate multiple lists on the same row into a single list. For details, see :cpp:func:`concatenate_list_elements`. @@ -106,20 +133,14 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): ---------- input : Column The input column - dropna : bool - If true, null list elements will be ignored - from concatenation. Otherwise any input null values will result in - the corresponding output row being set to null. + null_policy : ConcatenateNullPolicy + How to treat null list elements. Returns ------- Column A new Column of concatenated list elements """ - cdef concatenate_null_policy null_policy = ( - concatenate_null_policy.IGNORE if dropna - else concatenate_null_policy.NULLIFY_OUTPUT_ROW - ) cdef unique_ptr[column] c_result with nogil: @@ -191,7 +212,9 @@ cpdef Column contains_nulls(Column input): return Column.from_libcudf(move(c_result)) -cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option): +cpdef Column index_of( + Column input, ColumnOrScalar search_key, duplicate_find_option find_option +): """Create a column of index values indicating the position of a search key row within the corresponding list row in the lists column. @@ -207,9 +230,8 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o The input column. search_key : Union[Column, Scalar] The search key. - find_first_option : bool - If true, index_of returns the first match. - Otherwise the last match is returned. + find_option : DuplicateFindOption + Which match to return if there are duplicates. 
 Returns ------- @@ -220,11 +242,6 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o """ cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef cpp_contains.duplicate_find_option find_option = ( - cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option - else cpp_contains.duplicate_find_option.FIND_LAST - ) - with nogil: c_result = cpp_contains.index_of( list_view.view(), @@ -380,7 +397,7 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None): cpdef Column sort_lists( Column input, - bool ascending, + order sort_order, null_order na_position, bool stable = False ): @@ -392,8 +409,8 @@ cpdef Column sort_lists( ---------- input : Column The input column. - ascending : bool - If true, the sort order is ascending. Otherwise, the sort order is descending. + sort_order : Order + Sort order in the list. na_position : NullOrder If na_position equals NullOrder.FIRST, then the null values in the output column are placed first. Otherwise, they are be placed after. @@ -409,21 +426,17 @@ cpdef Column sort_lists( cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef order c_sort_order = ( - order.ASCENDING if ascending else order.DESCENDING - ) - with nogil: if stable: c_result = cpp_stable_sort_lists( list_view.view(), - c_sort_order, + sort_order, na_position, ) else: c_result = cpp_sort_lists( list_view.view(), - c_sort_order, + sort_order, na_position, ) return Column.from_libcudf(move(c_result)) @@ -432,8 +445,8 @@ cpdef Column sort_lists( cpdef Column difference_distinct( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Create a column of index values indicating the position of a search key row within the corresponding list row in the lists column.
@@ -446,11 +459,10 @@ cpdef Column difference_distinct( The input lists column of elements that may be included. rhs : Column The input lists column of elements to exclude. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Are nulls considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Are nans considered equal. Returns ------- @@ -461,19 +473,12 @@ cpdef Column difference_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_set_operations.difference_distinct( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) @@ -481,8 +486,8 @@ cpdef Column difference_distinct( cpdef Column have_overlap( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Check if lists at each row of the given lists columns overlap. @@ -494,11 +499,10 @@ cpdef Column have_overlap( The input lists column for one side. rhs : Column The input lists column for the other side. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Are nulls considered equal. 
+ nans_equal : NanEquality, default ALL_EQUAL + Are nans considered equal. Returns ------- @@ -509,19 +513,12 @@ cpdef Column have_overlap( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_set_operations.have_overlap( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) @@ -529,8 +526,8 @@ cpdef Column have_overlap( cpdef Column intersect_distinct( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Create a lists column of distinct elements common to two input lists columns. @@ -542,11 +539,10 @@ cpdef Column intersect_distinct( The input lists column of elements that may be included. rhs : Column The input lists column of elements to exclude. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Are nulls considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Are nans considered equal. 
Returns ------- @@ -557,19 +553,12 @@ cpdef Column intersect_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_set_operations.intersect_distinct( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) @@ -577,8 +566,8 @@ cpdef Column intersect_distinct( cpdef Column union_distinct( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Create a lists column of distinct elements found in either of two input lists columns. @@ -591,11 +580,10 @@ cpdef Column union_distinct( The input lists column of elements that may be included. rhs : Column The input lists column of elements to exclude. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Are nulls considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Are nans considered equal. 
Returns ------- @@ -606,19 +594,12 @@ cpdef Column union_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_set_operations.union_distinct( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) @@ -651,7 +632,7 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask): return Column.from_libcudf(move(c_result)) -cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): +cpdef Column distinct(Column input, null_equality nulls_equal, nan_equality nans_equal): """Create a new list column without duplicate elements in each list. For details, see :cpp:func:`distinct`. @@ -660,11 +641,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): ---------- input : Column The input column. - nulls_equal : bool - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality + Are nulls considered equal. + nans_equal : NanEquality + Are nans considered equal. 
Returns ------- @@ -674,17 +654,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_distinct( list_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/merge.pyi b/python/pylibcudf/pylibcudf/merge.pyi new file mode 100644 index 00000000000..b18eb01f8a2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/merge.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def merge( + tables_to_merge: list[Table], + key_cols: list[int], + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index 61a21aafdb2..c051cdc0c66 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order, size_type from .table cimport Table +__all__ = ["merge"] cpdef Table merge ( list tables_to_merge, diff --git a/python/pylibcudf/pylibcudf/null_mask.pyi b/python/pylibcudf/pylibcudf/null_mask.pyi new file mode 100644 index 00000000000..1a6d96a0822 --- /dev/null +++ b/python/pylibcudf/pylibcudf/null_mask.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from rmm.pylibrmm.device_buffer import DeviceBuffer + +from pylibcudf.column import Column +from pylibcudf.types import MaskState + +def copy_bitmask(col: Column) -> DeviceBuffer: ... +def bitmask_allocation_size_bytes(number_of_bits: int) -> int: ... 
+def create_null_mask( + size: int, state: MaskState = MaskState.UNINITIALIZED +) -> DeviceBuffer: ... +def bitmask_and(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... +def bitmask_or(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 74180951562..adc264e9af6 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -14,6 +14,13 @@ from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint from .column cimport Column from .table cimport Table +__all__ = [ + "bitmask_allocation_size_bytes", + "bitmask_and", + "bitmask_or", + "copy_bitmask", + "create_null_mask", +] cdef DeviceBuffer buffer_to_python(device_buffer buf): return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf))) diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi new file mode 100644 index 00000000000..ca39aa16d7e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class BPEMergePairs: + def __init__(self, merge_pairs: Column): ... + +def byte_pair_encoding( + input: Column, merge_pairs: BPEMergePairs, separator: Scalar | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx index 76caad276d4..7565b21084f 100644 --- a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.scalar cimport Scalar +__all__ = ["BPEMergePairs", "byte_pair_encoding"] cdef class BPEMergePairs: """The table of merge pairs for the BPE encoder. 
@@ -27,6 +28,8 @@ cdef class BPEMergePairs: with nogil: self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + __hash__ = None + cpdef Column byte_pair_encoding( Column input, BPEMergePairs merge_pairs, diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi new file mode 100644 index 00000000000..85bbbb880ee --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def edit_distance(input: Column, targets: Column) -> Column: ... +def edit_distance_matrix(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx index dcacb2e1267..eceeaff24e3 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.nvtext.edit_distance cimport ( edit_distance_matrix as cpp_edit_distance_matrix, ) +__all__ = ["edit_distance", "edit_distance_matrix"] cpdef Column edit_distance(Column input, Column targets): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi new file mode 100644 index 00000000000..2757518379d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def generate_ngrams( + input: Column, ngrams: int, separator: Scalar +) -> Column: ... +def generate_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... +def hash_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx index 09859d09e9e..521bc0ef4a4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -14,6 +14,11 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = [ + "generate_ngrams", + "generate_character_ngrams", + "hash_character_ngrams", +] cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi new file mode 100644 index 00000000000..18263c5c8fd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def jaccard_index(input1: Column, input2: Column, width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx index 3d8669865d9..90cace088f7 100644 --- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.jaccard cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["jaccard_index"] cpdef Column jaccard_index(Column input1, Column input2, size_type width): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi new file mode 100644 index 00000000000..a2d9b6364f7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def minhash( + input: Column, seeds: Column | Scalar, width: int = 4 +) -> Column: ... 
+def minhash64( + input: Column, seeds: Column | Scalar, width: int = 4 +) -> Column: ... +def word_minhash(input: Column, seeds: Column) -> Column: ... +def word_minhash64(input: Column, seeds: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 5a51e32b287..5448cc6de9b 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -20,6 +20,12 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference import warnings +__all__ = [ + "minhash", + "minhash64", + "word_minhash", + "word_minhash64", +] cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi new file mode 100644 index 00000000000..224640ed44d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def ngrams_tokenize( + input: Column, ngrams: int, delimiter: Scalar, separator: Scalar +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx index 8a1854c5f0d..771c7c019fc 100644 --- a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["ngrams_tokenize"] cpdef Column ngrams_tokenize( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi new file mode 100644 index 00000000000..1d90a5a8960 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def normalize_spaces(input: Column) -> Column: ... +def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index 637d900b659..b259ccaefa6 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.normalize cimport ( normalize_spaces as cpp_normalize_spaces, ) +__all__ = ["normalize_characters", "normalize_spaces"] cpdef Column normalize_spaces(Column input): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyi b/python/pylibcudf/pylibcudf/nvtext/replace.pyi new file mode 100644 index 00000000000..1f1ac72ce7c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace_tokens( + input: Column, + targets: Column, + replacements: Column, + delimiter: Scalar | None = None, +) -> Column: ... 
+def filter_tokens( + input: Column, + min_token_length: int, + replacement: Scalar | None = None, + delimiter: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx index b65348ce14d..a27592fb434 100644 --- a/python/pylibcudf/pylibcudf/nvtext/replace.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["filter_tokens", "replace_tokens"] cpdef Column replace_tokens( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi new file mode 100644 index 00000000000..d6ba1d189bd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def is_letter( + input: Column, check_vowels: bool, indices: Column | int +) -> Column: ... +def porter_stemmer_measure(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx index 854d1053624..c9e4f1274e4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.nvtext.stemmer cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["is_letter", "porter_stemmer_measure"] cpdef Column is_letter( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi new file mode 100644 index 00000000000..f6618e296b1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class HashedVocabulary: + def __init__(self, hash_file: str): ... 
+ +def subword_tokenize( + input: Column, + vocabulary_table: HashedVocabulary, + max_sequence_length: int, + stride: int, + do_lower_case: bool, + do_truncate: bool, +) -> tuple[Column, Column, Column]: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx index 04643d3bd84..14fb6f5fe1e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( tokenizer_result as cpp_tokenizer_result, ) +__all__ = ["HashedVocabulary", "subword_tokenize"] cdef class HashedVocabulary: """The vocabulary data for use with the subword_tokenize function. @@ -24,6 +25,8 @@ cdef class HashedVocabulary: with nogil: self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) + __hash__ = None + cpdef tuple[Column, Column, Column] subword_tokenize( Column input, HashedVocabulary vocabulary_table, diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi new file mode 100644 index 00000000000..b9aa2393514 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class TokenizeVocabulary: + def __init__(self, vocab: Column): ... + +def tokenize_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def tokenize_column(input: Column, delimiters: Column) -> Column: ... +def count_tokens_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def count_tokens_column(input: Column, delimiters: Column) -> Column: ... +def character_tokenize(input: Column) -> Column: ... +def detokenize( + input: Column, row_indices: Column, separator: Scalar | None = None +) -> Column: ... 
+def tokenize_with_vocabulary( + input: Column, + vocabulary: TokenizeVocabulary, + delimiter: Scalar, + default_id: int = -1, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx index ec02e8ebf4e..43d426489b4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx @@ -20,6 +20,16 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = [ + "TokenizeVocabulary", + "character_tokenize", + "count_tokens_column", + "count_tokens_scalar", + "detokenize", + "tokenize_column", + "tokenize_scalar", + "tokenize_with_vocabulary", +] cdef class TokenizeVocabulary: """The Vocabulary object to be used with ``tokenize_with_vocabulary``. @@ -31,6 +41,8 @@ cdef class TokenizeVocabulary: with nogil: self.c_obj = move(cpp_load_vocabulary(c_vocab)) + __hash__ = None + cpdef Column tokenize_scalar(Column input, Scalar delimiter=None): """ Returns a single column of strings by tokenizing the input diff --git a/python/pylibcudf/pylibcudf/partitioning.pyi b/python/pylibcudf/pylibcudf/partitioning.pyi new file mode 100644 index 00000000000..48a2ade23f1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def hash_partition( + input: Table, columns_to_hash: list[int], num_partitions: int +) -> tuple[Table, list[int]]: ... +def partition( + t: Table, partition_map: Column, num_partitions: int +) -> tuple[Table, list[int]]: ... +def round_robin_partition( + input: Table, num_partitions: int, start_partition: int = 0 +) -> tuple[Table, list[int]]: ... 
diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 3cff4843735..1dacabceb06 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -11,6 +11,11 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column from .table cimport Table +__all__ = [ + "hash_partition", + "partition", + "round_robin_partition", +] cpdef tuple[Table, list] hash_partition( Table input, diff --git a/python/pylibcudf/pylibcudf/py.typed b/python/pylibcudf/pylibcudf/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/quantiles.pyi b/python/pylibcudf/pylibcudf/quantiles.pyi new file mode 100644 index 00000000000..dca6eed013a --- /dev/null +++ b/python/pylibcudf/pylibcudf/quantiles.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Sequence + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import Interpolation, NullOrder, Order, Sorted + +def quantile( + input: Column, + q: Sequence[float], + interp: Interpolation = Interpolation.LINEAR, + ordered_indices: Column | None = None, + exact: bool = True, +) -> Column: ... +def quantiles( + input: Table, + q: Sequence[float], + interp: Interpolation = Interpolation.NEAREST, + is_input_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, +) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 7d92b598bd0..634218586ac 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -17,6 +17,7 @@ from .column cimport Column from .table cimport Table from .types cimport interpolation +__all__ = ["quantile", "quantiles"] cpdef Column quantile( Column input, diff --git a/python/pylibcudf/pylibcudf/reduce.pyi b/python/pylibcudf/pylibcudf/reduce.pyi new file mode 100644 index 00000000000..a09949b7b30 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reduce.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class ScanType(IntEnum): + INCLUSIVE = ... + EXCLUSIVE = ... + +def reduce(col: Column, agg: Aggregation, data_type: DataType) -> Scalar: ... +def scan(col: Column, agg: Aggregation, inclusive: ScanType) -> Column: ... +def minmax(col: Column) -> tuple[Scalar, Scalar]: ... diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index d9ec3a9bdc4..1d6ffd9de10 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -16,6 +16,7 @@ from .types cimport DataType from pylibcudf.libcudf.reduce import scan_type as ScanType # no-cython-lint +__all__ = ["ScanType", "minmax", "reduce", "scan"] cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): """Perform a reduction on a column diff --git a/python/pylibcudf/pylibcudf/replace.pyi b/python/pylibcudf/pylibcudf/replace.pyi new file mode 100644 index 00000000000..eed7a2a6c52 --- /dev/null +++ b/python/pylibcudf/pylibcudf/replace.pyi @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class ReplacePolicy(IntEnum): + PRECEDING = ... + FOLLOWING = ... + +def replace_nulls( + source_column: Column, replacement: Column | Scalar | ReplacePolicy +) -> Column: ... +def find_and_replace_all( + source_column: Column, + values_to_replace: Column, + replacement_values: Column, +) -> Column: ... +def clamp( + source_column: Column, + lo: Scalar, + hi: Scalar, + lo_replace: Scalar | None = None, + hi_replace: Scalar | None = None, +) -> Column: ... +def normalize_nans_and_zeros( + source_column: Column, inplace: bool = False +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index f77eba7ace5..51be2b29277 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -15,6 +15,14 @@ from pylibcudf.libcudf.replace import \ from .column cimport Column from .scalar cimport Scalar +__all__ = [ + "ReplacePolicy", + "clamp", + "find_and_replace_all", + "normalize_nans_and_zeros", + "replace_nulls", +] + cpdef Column replace_nulls(Column source_column, ReplacementType replacement): """Replace nulls in source_column. diff --git a/python/pylibcudf/pylibcudf/reshape.pyi b/python/pylibcudf/pylibcudf/reshape.pyi new file mode 100644 index 00000000000..d8d0ffcc3e0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reshape.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def interleave_columns(source_table: Table) -> Column: ... +def tile(source_table: Table, count: int) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index 6540b5198ab..bdc212a1985 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .table cimport Table +__all__ = ["interleave_columns", "tile"] cpdef Column interleave_columns(Table source_table): """Interleave columns of a table into a single column. diff --git a/python/pylibcudf/pylibcudf/rolling.pyi b/python/pylibcudf/pylibcudf/rolling.pyi new file mode 100644 index 00000000000..ca0111e01ec --- /dev/null +++ b/python/pylibcudf/pylibcudf/rolling.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column + +def rolling_window[WindowType: (Column, int)]( + source: Column, + preceding_window: WindowType, + following_window: WindowType, + min_periods: int, + agg: Aggregation, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index 4fd0b005431..11acf57ccf4 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column +__all__ = ["rolling_window"] cpdef Column rolling_window( Column source, diff --git a/python/pylibcudf/pylibcudf/round.pyi b/python/pylibcudf/pylibcudf/round.pyi new file mode 100644 index 00000000000..410cf5de586 --- /dev/null +++ b/python/pylibcudf/pylibcudf/round.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column + +class RoundingMethod(IntEnum): + HALF_UP = ... + HALF_EVEN = ... + +def round( + source: Column, + decimal_places: int = 0, + round_method: RoundingMethod = RoundingMethod.HALF_UP, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index 689363e652d..09e5a9cc3bc 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.column.column cimport column from .column cimport Column +__all__ = ["RoundingMethod", "round"] cpdef Column round( Column source, diff --git a/python/pylibcudf/pylibcudf/scalar.pyi b/python/pylibcudf/pylibcudf/scalar.pyi new file mode 100644 index 00000000000..0b72b10ef86 --- /dev/null +++ b/python/pylibcudf/pylibcudf/scalar.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +class Scalar: + def type(self) -> DataType: ... + def is_valid(self) -> bool: ... + @staticmethod + def empty_like(column: Column) -> Scalar: ... diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index d4888a62ad1..1ac014e891e 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -11,6 +11,8 @@ from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType +__all__ = ["Scalar"] + # The DeviceMemoryResource attribute could be released prematurely # by the gc if the Scalar is in a reference cycle. Removing the tp_clear @@ -37,6 +39,8 @@ cdef class Scalar: # DeviceScalar. raise ValueError("Scalar should be constructed with a factory") + __hash__ = None + cdef const scalar* get(self) noexcept nogil: return self.c_obj.get() diff --git a/python/pylibcudf/pylibcudf/search.pyi b/python/pylibcudf/pylibcudf/search.pyi new file mode 100644 index 00000000000..7f292b129b2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/search.pyi @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def lower_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def upper_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def contains(haystack: Column, needles: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 1a870248046..50353fcd0cc 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table +__all__ = ["contains", "lower_bound", "upper_bound"] cpdef Column lower_bound( Table haystack, diff --git a/python/pylibcudf/pylibcudf/sorting.pyi b/python/pylibcudf/pylibcudf/sorting.pyi new file mode 100644 index 00000000000..5255d869a4d --- /dev/null +++ b/python/pylibcudf/pylibcudf/sorting.pyi @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import RankMethod +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, NullPolicy, Order + +def sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def stable_sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def rank( + input_view: Column, + method: RankMethod, + column_order: Order, + null_handling: NullPolicy, + null_precedence: NullOrder, + percentage: bool, +) -> Column: ... +def is_sorted( + tbl: Table, column_order: list[Order], null_precedence: list[NullOrder] +) -> bool: ... 
+def segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index fc40f03e1fd..fb29ef8c571 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -12,6 +12,18 @@ from pylibcudf.libcudf.types cimport null_order, null_policy, order from .column cimport Column from .table cimport Table +__all__ = [ + "is_sorted", + "rank", + "segmented_sort_by_key", + "sort", + "sort_by_key", + "sorted_order", + "stable_segmented_sort_by_key", + "stable_sort", + "stable_sort_by_key", + "stable_sorted_order", +] cpdef Column sorted_order(Table source_table, list column_order, list null_precedence): """Computes the row indices required to sort the table. 
diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd index a4f39792f0c..a20a23e2e58 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd @@ -17,6 +17,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold) cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold) +cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask) + cpdef Table unique( Table input, list keys, diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyi b/python/pylibcudf/pylibcudf/stream_compaction.pyi new file mode 100644 index 00000000000..99cade48309 --- /dev/null +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyi @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NanPolicy, NullEquality, NullPolicy + +class DuplicateKeepOption(IntEnum): + KEEP_ANY = ... + KEEP_FIRST = ... + KEEP_LAST = ... + KEEP_NONE = ... + +def drop_nulls( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def drop_nans( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def apply_boolean_mask(source_table: Table, boolean_mask: Column) -> Table: ... +def unique( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, +) -> Table: ... +def distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... +def distinct_indices( + input: Table, + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Column: ... +def stable_distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... 
+def unique_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... +def distinct_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index 2145398a191..6e403ca1b07 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -21,6 +21,18 @@ from pylibcudf.libcudf.stream_compaction import \ from .column cimport Column from .table cimport Table +__all__ = [ + "DuplicateKeepOption", + "apply_boolean_mask", + "distinct", + "distinct_count", + "distinct_indices", + "drop_nans", + "drop_nulls", + "stable_distinct", + "unique", + "unique_count", +] cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of nulls. diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index fa7294c7dbd..67054f0b447 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -28,6 +28,7 @@ from .side_type import SideType __all__ = [ + "SideType", "attributes", "capitalize", "case", @@ -46,9 +47,8 @@ "replace", "replace_re", "slice", - "strip", "split", - "SideType", + "strip", "translate", "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyi b/python/pylibcudf/pylibcudf/strings/attributes.pyi new file mode 100644 index 00000000000..7fd5c9773d4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def count_characters(source_strings: Column) -> Column: ... +def count_bytes(source_strings: Column) -> Column: ... +def code_points(source_strings: Column) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 8e46a32835d..f1eb09b4965 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport attributes as cpp_attributes +__all__ = ["code_points", "count_bytes", "count_characters"] cpdef Column count_characters(Column source_strings): """ diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyi b/python/pylibcudf/pylibcudf/strings/capitalize.pyi new file mode 100644 index 00000000000..5c6689418e2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.char_types import StringCharacterTypes + +def capitalize(input: Column, delimiters: Scalar | None = None) -> Column: ... +def title( + input: Column, + sequence_type: StringCharacterTypes = StringCharacterTypes.ALPHA, +) -> Column: ... +def is_title(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx index 06b991c3cf1..a54480b8e4a 100644 --- a/python/pylibcudf/pylibcudf/strings/capitalize.pyx +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx @@ -14,6 +14,7 @@ from pylibcudf.strings.char_types cimport string_character_types from cython.operator import dereference +__all__ = ["capitalize", "is_title", "title"] cpdef Column capitalize( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/case.pyi b/python/pylibcudf/pylibcudf/strings/case.pyi new file mode 100644 index 00000000000..4e50db4d1da --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/case.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column + +def to_lower(input: Column) -> Column: ... +def to_upper(input: Column) -> Column: ... +def swapcase(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/case.pyx b/python/pylibcudf/pylibcudf/strings/case.pyx index 9e6cd7717d3..d0e054bef72 100644 --- a/python/pylibcudf/pylibcudf/strings/case.pyx +++ b/python/pylibcudf/pylibcudf/strings/case.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport case as cpp_case +__all__ = ["swapcase", "to_lower", "to_upper"] cpdef Column to_lower(Column input): cdef unique_ptr[column] c_result diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyi b/python/pylibcudf/pylibcudf/strings/char_types.pyi new file mode 100644 index 00000000000..daa36cbb68d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyi @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class StringCharacterTypes(IntEnum): + DECIMAL = ... + NUMERIC = ... + DIGIT = ... + ALPHA = ... + SPACE = ... + UPPER = ... + LOWER = ... + ALPHANUM = ... + CASE_TYPES = ... + ALL_TYPES = ... + +def all_characters_of_type( + source_strings: Column, + types: StringCharacterTypes, + verify_types: StringCharacterTypes, +) -> Column: ... +def filter_characters_of_type( + source_strings: Column, + types_to_remove: StringCharacterTypes, + replacement: Scalar, + types_to_keep: StringCharacterTypes, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index cb04efe5e8f..0af4a1f9c37 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -12,6 +12,11 @@ from cython.operator import dereference from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint +__all__ = [ + "StringCharacterTypes", + "all_characters_of_type", + "filter_characters_of_type", +] cpdef Column all_characters_of_type( Column source_strings, diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyi b/python/pylibcudf/pylibcudf/strings/combine.pyi new file mode 100644 index 00000000000..3094b20f141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyi @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +class SeparatorOnNulls(IntEnum): + YES = ... + NO = ... + +class OutputIfEmptyList(IntEnum): + EMPTY_STRING = ... + NULL_ELEMENT = ... + +def concatenate( + strings_columns: Table, + separator: Column | Scalar, + narep: Scalar | None = None, + col_narep: Scalar | None = None, + separate_nulls: SeparatorOnNulls = SeparatorOnNulls.YES, +) -> Column: ... +def join_strings( + input: Column, separator: Scalar, narep: Scalar +) -> Column: ... +def join_list_elements( + lists_strings_column: Column, + separator: Column | Scalar, + separator_narep: Scalar, + string_narep: Scalar, + separate_nulls: SeparatorOnNulls, + empty_list_policy: OutputIfEmptyList, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index f17d5265ab4..dc1e72c799b 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -17,6 +17,13 @@ from pylibcudf.libcudf.strings.combine import \ from pylibcudf.libcudf.strings.combine import \ separator_on_nulls as SeparatorOnNulls # no-cython-lint +__all__ = [ + "OutputIfEmptyList", + "SeparatorOnNulls", + "concatenate", + "join_list_elements", + "join_strings", +] cpdef Column concatenate( Table strings_columns, diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyi b/python/pylibcudf/pylibcudf/strings/contains.pyi new file mode 100644 index 00000000000..1f0620383b3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/contains.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_program import RegexProgram + +def contains_re(input: Column, prog: RegexProgram) -> Column: ... +def count_re(input: Column, prog: RegexProgram) -> Column: ... +def matches_re(input: Column, prog: RegexProgram) -> Column: ... +def like( + input: Column, + pattern: Column | Scalar, + escape_character: Scalar | None = None, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index d4b1130241d..7b4c53ed853 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["contains_re", "count_re", "like", "matches_re"] cpdef Column contains_re( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index aa27a7c8929..08b5034456e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -10,3 +10,15 @@ convert_lists, convert_urls, ) + +__all__ = [ + "convert_booleans", + "convert_datetime", + "convert_durations", + "convert_fixed_point", + "convert_floats", + "convert_integers", + "convert_ipv4", + "convert_lists", + "convert_urls", +] diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi new file mode 100644 index 00000000000..77c09242e9a --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def to_booleans(input: Column, true_string: Scalar) -> Column: ... +def from_booleans( + booleans: Column, true_string: Scalar, false_string: Scalar +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx index dc12b291b11..1899a3b27cc 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -12,6 +12,7 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference +__all__ = ["from_booleans", "to_booleans"] cpdef Column to_booleans(Column input, Scalar true_string): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi new file mode 100644 index 00000000000..c6857169765 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_timestamps( + input: Column, timestamp_type: DataType, format: str +) -> Column: ... +def from_timestamps( + timestamps: Column, format: str, input_strings_names: Column +) -> Column: ... +def is_timestamp(input: Column, format: str) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index 0ee60812e00..f1cd684166c 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_timestamps", "is_timestamp", "to_timestamps"] cpdef Column to_timestamps( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi new file mode 100644 index 00000000000..a5787a5fe49 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_durations( + input: Column, duration_type: DataType, format: str +) -> Column: ... +def from_durations(durations: Column, format: str | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index 31980ace418..a9654afd00a 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_durations", "to_durations"] cpdef Column to_durations( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi new file mode 100644 index 00000000000..1192d3dfcd6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_fixed_point(input: Column, output_type: DataType) -> Column: ... +def from_fixed_point(input: Column) -> Column: ... +def is_fixed_point( + input: Column, decimal_type: DataType | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 962a47dfadf..00cbc822f36 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -9,6 +9,8 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType, type_id +__all__ = ["from_fixed_point", "is_fixed_point", "to_fixed_point"] + cpdef Column to_fixed_point(Column input, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi new file mode 100644 index 00000000000..ddf4042e10d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_floats(strings: Column, output_type: DataType) -> Column: ... +def from_floats(floats: Column) -> Column: ... +def is_float(input: Column) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx index 1296f4f9db5..b5199aac577 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType +__all__ = ["from_floats", "is_float", "to_floats"] cpdef Column to_floats(Column strings, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi new file mode 100644 index 00000000000..b96226fba90 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_integers(input: Column, output_type: DataType) -> Column: ... +def from_integers(integers: Column) -> Column: ... +def is_integer(input: Column, int_type: DataType | None = None) -> Column: ... +def hex_to_integers(input: Column, output_type: DataType) -> Column: ... +def is_hex(input: Column) -> Column: ... +def integers_to_hex(input: Column) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx index 5558683a502..12984e15ce9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -9,6 +9,14 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType +__all__ = [ + "from_integers", + "hex_to_integers", + "integers_to_hex", + "is_hex", + "is_integer", + "to_integers" +] cpdef Column to_integers(Column input, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi new file mode 100644 index 00000000000..b017b32598c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def ipv4_to_integers(input: Column) -> Column: ... +def integers_to_ipv4(integers: Column) -> Column: ... +def is_ipv4(input: Column) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx index 834781f95f3..e7c6aae4fa8 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 +__all__ = ["integers_to_ipv4", "ipv4_to_integers", "is_ipv4"] cpdef Column ipv4_to_integers(Column input): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi new file mode 100644 index 00000000000..6ab3a4183e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def format_list_column( + input: Column, + na_rep: Scalar | None = None, + separators: Column | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx index cbfe5f5aa8b..518f72f6644 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -17,6 +17,7 @@ from pylibcudf.types cimport type_id from cython.operator import dereference +__all__ = ["format_list_column"] cpdef Column format_list_column( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi new file mode 100644 index 00000000000..49b8468957c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column + +def url_encode(input: Column) -> Column: ... +def url_decode(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx index 82f8a75f1d9..bd5e23bca43 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls +__all__ = ["url_decode", "url_encode"] cpdef Column url_encode(Column input): """ diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyi b/python/pylibcudf/pylibcudf/strings/extract.pyi new file mode 100644 index 00000000000..4354bd3072d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.table import Table + +def extract(input: Column, prog: RegexProgram) -> Table: ... +def extract_all_record(input: Column, prog: RegexProgram) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index b56eccc8287..0ce70666e92 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.table cimport Table +__all__ = ["extract", "extract_all_record"] cpdef Table extract(Column input, RegexProgram prog): """ diff --git a/python/pylibcudf/pylibcudf/strings/find.pyi b/python/pylibcudf/pylibcudf/strings/find.pyi new file mode 100644 index 00000000000..3d04a9c3161 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def find( + input: Column, target: Column | Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def rfind( + input: Column, target: Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def contains(input: Column, target: Column | Scalar) -> Column: ... +def starts_with(input: Column, target: Column | Scalar) -> Column: ... +def ends_with(input: Column, target: Column | Scalar) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx index 6fc6dca24fd..f0af339ff08 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -10,6 +10,7 @@ from cython.operator import dereference from pylibcudf.libcudf.scalar.scalar cimport string_scalar +__all__ = ["contains", "ends_with", "find", "rfind", "starts_with"] cpdef Column find( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyi b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi new file mode 100644 index 00000000000..3d46fd2fa6d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def find_multiple(input: Column, targets: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx index 672aa606bd0..c9ce734b4be 100644 --- a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple +__all__ = ["find_multiple"] cpdef Column find_multiple(Column input, Column targets): """ diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyi b/python/pylibcudf/pylibcudf/strings/findall.pyi new file mode 100644 index 00000000000..77e38581d22 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram + +def find_re(input: Column, pattern: RegexProgram) -> Column: ... +def findall(input: Column, pattern: RegexProgram) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 89fa4302824..23c84675a16 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport findall as cpp_findall from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["findall", "find_re"] cpdef Column findall(Column input, RegexProgram pattern): """ diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyi b/python/pylibcudf/pylibcudf/strings/padding.pyi new file mode 100644 index 00000000000..a991935e6e5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.side_type import SideType + +def pad( + input: Column, width: int, side: SideType, fill_char: str +) -> Column: ... +def zfill(input: Column, width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx index f6950eecf60..0e349a7be47 100644 --- a/python/pylibcudf/pylibcudf/strings/padding.pyx +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport padding as cpp_padding from pylibcudf.libcudf.strings.side_type cimport side_type +__all__ = ["pad", "zfill"] cpdef Column pad(Column input, size_type width, side_type side, str fill_char): """ diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyi b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi new file mode 100644 index 00000000000..c551cebf181 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class RegexFlags(IntEnum): + DEFAULT = ... + MULTILINE = ... 
+ DOTALL = ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx index ce3b6b10a42..65b504e0dc7 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx @@ -2,3 +2,5 @@ from pylibcudf.libcudf.strings.regex_flags import \ regex_flags as RegexFlags # no-cython-lint + +__all__ = ["RegexFlags"] diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyi b/python/pylibcudf/pylibcudf/strings/regex_program.pyi new file mode 100644 index 00000000000..9abd6fa7802 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.strings.regex_flags import RegexFlags + +class RegexProgram: + def __init__(self): ... + @staticmethod + def create(pattern: str, flags: RegexFlags) -> RegexProgram: ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx index 91f585cd637..46bfde074d2 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_program.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx @@ -11,6 +11,7 @@ from pylibcudf.strings.regex_flags import RegexFlags from pylibcudf.strings.regex_flags cimport regex_flags +__all__ = ["RegexProgram"] cdef class RegexProgram: """Regex program class. @@ -24,6 +25,8 @@ cdef class RegexProgram: def __init__(self, *args, **kwargs): raise ValueError("Do not instantiate RegexProgram directly, use create") + __hash__ = None + @staticmethod def create(str pattern, int flags): """Create a program from a pattern. diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyi b/python/pylibcudf/pylibcudf/strings/repeat.pyi new file mode 100644 index 00000000000..93a46b71caa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column + +def repeat_strings(input: Column, repeat_times: Column | int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index fb2bb13c666..a497b1f438e 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type +__all__ = ["repeat_strings"] cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): """ diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyi b/python/pylibcudf/pylibcudf/strings/replace.pyi new file mode 100644 index 00000000000..64df09ef7e8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace( + input: Column, target: Scalar, repl: Scalar, maxrepl: int = -1 +) -> Column: ... +def replace_multiple( + input: Column, target: Column, repl: Column, maxrepl: int = -1 +) -> Column: ... +def replace_slice( + input: Column, repl: Scalar | None = None, start: int = 0, stop: int = -1 +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 2b94f5e3fee..3ba6c1b5530 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.strings.replace cimport ( from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["replace", "replace_multiple", "replace_slice"] cpdef Column replace( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyi b/python/pylibcudf/pylibcudf/strings/replace_re.pyi new file mode 100644 index 00000000000..056bafbf7ef --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyi @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import overload + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_flags import RegexFlags +from pylibcudf.strings.regex_program import RegexProgram + +@overload +def replace_re( + input: Column, + pattern: RegexProgram, + replacement: Scalar, + max_replace_count: int = -1, +) -> Column: ... +@overload +def replace_re( + input: Column, + patterns: list[str], + replacement: Column, + max_replace_count: int = -1, + flags: RegexFlags = RegexFlags.DEFAULT, +) -> Column: ... +def replace_with_backrefs( + input: Column, prog: RegexProgram, replacement: str +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx index ccc33fd4425..bdabc779ddf 100644 --- a/python/pylibcudf/pylibcudf/strings/replace_re.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -16,6 +16,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_flags cimport regex_flags from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["replace_re", "replace_with_backrefs"] cpdef Column replace_re( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyi b/python/pylibcudf/pylibcudf/strings/side_type.pyi new file mode 100644 index 00000000000..532edd60077 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class SideType(IntEnum): + LEFT = ... + RIGHT = ... + BOTH = ... diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index cf0c770cc11..87db4206a9c 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,3 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint + +__all__ = ["SideType"] diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyi b/python/pylibcudf/pylibcudf/strings/slice.pyi new file mode 100644 index 00000000000..7bf9a7cb8c6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/slice.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def slice_strings( + input: Column, + start: Column | Scalar | None = None, + stop: Column | Scalar | None = None, + step: Scalar | None = None, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx index 70d10cab36c..d32de7c50e0 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyx +++ b/python/pylibcudf/pylibcudf/strings/slice.pyx @@ -14,6 +14,7 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference +__all__ = ["slice_strings"] cpdef Column slice_strings( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py index 2033e5e275b..db2a597882e 100644 --- a/python/pylibcudf/pylibcudf/strings/split/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -1,2 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . import partition, split + +__all__ = ["partition", "split"] diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyi b/python/pylibcudf/pylibcudf/strings/split/partition.pyi new file mode 100644 index 00000000000..f19a463bd7e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +def partition(input: Column, delimiter: Scalar | None = None) -> Table: ... +def rpartition(input: Column, delimiter: Scalar | None = None) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx index 0fb4f186c41..75537ea46d3 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -13,6 +13,7 @@ from pylibcudf.table cimport Table from cython.operator import dereference +__all__ = ["partition", "rpartition"] cpdef Table partition(Column input, Scalar delimiter=None): """ diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyi b/python/pylibcudf/pylibcudf/strings/split/split.pyi new file mode 100644 index 00000000000..3ccf0bc2a01 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyi @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.table import Table + +def split( + strings_column: Column, delimiter: Scalar, maxsplit: int +) -> Table: ... +def rsplit( + strings_column: Column, delimiter: Scalar, maxsplit: int +) -> Table: ... +def split_record( + strings: Column, delimiter: Scalar, maxsplit: int +) -> Column: ... +def rsplit_record( + strings: Column, delimiter: Scalar, maxsplit: int +) -> Column: ... +def split_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ... +def rsplit_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ... +def split_record_re( + input: Column, prog: RegexProgram, maxsplit: int +) -> Column: ... +def rsplit_record_re( + input: Column, prog: RegexProgram, maxsplit: int +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx index e3827f6645e..90087f996f0 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -13,6 +13,16 @@ from pylibcudf.table cimport Table from cython.operator import dereference +__all__ = [ + "rsplit", + "rsplit_re", + "rsplit_record", + "rsplit_record_re", + "split", + "split_re", + "split_record", + "split_record_re", +] cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): """ diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyi b/python/pylibcudf/pylibcudf/strings/strip.pyi new file mode 100644 index 00000000000..680355fc88f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.side_type import SideType + +def strip( + input: Column, + side: SideType = SideType.BOTH, + to_strip: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx index 429a23c3cdf..805d959891b 100644 --- a/python/pylibcudf/pylibcudf/strings/strip.pyx +++ b/python/pylibcudf/pylibcudf/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings cimport strip as cpp_strip from pylibcudf.scalar cimport Scalar from pylibcudf.strings.side_type cimport side_type +__all__ = ["strip"] cpdef Column strip( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyi b/python/pylibcudf/pylibcudf/strings/translate.pyi new file mode 100644 index 00000000000..7158b6eb05c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pyi @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from collections.abc import Mapping +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class FilterType(IntEnum): + KEEP = ... + REMOVE = ... + +def translate( + input: Column, chars_table: Mapping[int | str, int | str] +) -> Column: ... +def filter_characters( + input: Column, + characters_to_filter: Mapping[int | str, int | str], + keep_characters: FilterType, + replacement: Scalar, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index d85da8e6cdd..ba1e8dc5d27 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -14,6 +14,7 @@ from cython.operator import dereference from pylibcudf.libcudf.strings.translate import \ filter_type as FilterType # no-cython-lint +__all__ = ["FilterType", "filter_characters", "translate"] cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table): """ diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyi b/python/pylibcudf/pylibcudf/strings/wrap.pyi new file mode 100644 index 00000000000..5658f279197 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def wrap(input: Column, width: int) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx index 2ced250f837..b696eb48e47 100644 --- a/python/pylibcudf/pylibcudf/strings/wrap.pyx +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type +__all__ = ["wrap"] cpdef Column wrap(Column input, size_type width): """ diff --git a/python/pylibcudf/pylibcudf/table.pyi b/python/pylibcudf/pylibcudf/table.pyi new file mode 100644 index 00000000000..5aef7e009c8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/table.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class Table: + def __init__(self, column: list[Column]): ... + def num_columns(self) -> int: ... + def num_rows(self) -> int: ... + def columns(self) -> list[Column]: ... diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx index d0d6f2343d0..0c1e88a927c 100644 --- a/python/pylibcudf/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column +__all__ = ["Table"] cdef class Table: """A list of columns of the same size. @@ -24,6 +25,8 @@ cdef class Table: raise ValueError("All columns must be pylibcudf Column objects") self._columns = columns + __hash__ = None + cdef table_view view(self) nogil: """Generate a libcudf table_view to pass to libcudf algorithms. 
diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index bbb08e8b95a..a33122221f6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -541,13 +541,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_AND, pa.compute.and_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_AND, - pa.compute.and_, - ), ( "int64", "int64", @@ -562,13 +555,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_OR, pa.compute.or_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_OR, - pa.compute.or_, - ), ( "int64", "int64", diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py index beacfc63ce5..946d583d1cc 100644 --- a/python/pylibcudf/pylibcudf/tests/test_labeling.py +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -6,8 +6,12 @@ import pylibcudf as plc -@pytest.mark.parametrize("left_inclusive", [True, False]) -@pytest.mark.parametrize("right_inclusive", [True, False]) +@pytest.mark.parametrize( + "left_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO] +) +@pytest.mark.parametrize( + "right_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO] +) def test_label_bins(left_inclusive, right_inclusive): in_col = plc.interop.from_arrow(pa.array([1, 2, 3])) left_edges = plc.interop.from_arrow(pa.array([0, 5])) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index f3ef555f11d..8c1229c2a04 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -62,12 +62,12 @@ def test_concatenate_rows(test_data): [ ( [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]], - False, + plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, [[1, 2, 3, 4, 5], None], 
), ( [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]], - True, + plc.lists.ConcatenateNullPolicy.IGNORE, [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]], ), ], @@ -138,7 +138,9 @@ def test_index_of_scalar(list_column, scalar): plc_column = plc.interop.from_arrow(arr) plc_scalar = plc.interop.from_arrow(scalar) - res = plc.lists.index_of(plc_column, plc_scalar, True) + res = plc.lists.index_of( + plc_column, plc_scalar, plc.lists.DuplicateFindOption.FIND_FIRST + ) expect = pa.array([1, -1, -1, -1], type=pa.int32()) @@ -150,7 +152,9 @@ def test_index_of_list_column(list_column, search_key_column): arr2, expect = search_key_column plc_column1 = plc.interop.from_arrow(arr1) plc_column2 = plc.interop.from_arrow(arr2) - res = plc.lists.index_of(plc_column1, plc_column2, True) + res = plc.lists.index_of( + plc_column1, plc_column2, plc.lists.DuplicateFindOption.FIND_FIRST + ) expect = pa.array(search_key_column[1], type=pa.int32()) @@ -227,39 +231,34 @@ def test_sequences(): @pytest.mark.parametrize( - "ascending,na_position,expected", + "order,na_position,expected", [ ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.BEFORE, [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10, 10]], ), ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.AFTER, [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10, 10]], ), ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.BEFORE, [[4, 3, 2, 1], [4, 2, 1, None], [10, 10, 0, -10]], ), ( - False, - plc.types.NullOrder.AFTER, - [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], - ), - ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.AFTER, [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], ), ], ) -def test_sort_lists(lists_column, ascending, na_position, expected): +def test_sort_lists(lists_column, order, na_position, expected): plc_column = plc.interop.from_arrow(pa.array(lists_column)) - res = plc.lists.sort_lists(plc_column, ascending, na_position, False) - res_stable = plc.lists.sort_lists(plc_column, ascending, 
na_position, True) + res = plc.lists.sort_lists(plc_column, order, na_position, False) + res_stable = plc.lists.sort_lists(plc_column, order, na_position, True) expect = pa.array(expected) @@ -272,44 +271,44 @@ def test_sort_lists(lists_column, ascending, na_position, expected): [ ( plc.lists.difference_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, 5]], ), ( plc.lists.difference_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, None, 5]], ), ( plc.lists.have_overlap, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [True, False, None, True], ), ( plc.lists.have_overlap, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [True, False, None, False], ), ( plc.lists.intersect_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 1, 2], [], None, [None]], ), ( plc.lists.intersect_distinct, - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[1, 2], [], None, [None]], ), ( plc.lists.union_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [ [np.nan, 2, 1, 3], [1, 2, 3, 4, 5], @@ -319,8 +318,8 @@ def test_sort_lists(lists_column, ascending, na_position, expected): ), ( plc.lists.union_distinct, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 2, 1, np.nan, 3], [1, 2, 3, 4, 5], @@ -352,20 +351,24 @@ def test_set_operations( @pytest.mark.parametrize( "nans_equal,nulls_equal,expected", [ - (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), ( - False, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, + [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], + ), + ( + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 0, 1, 2, 3], 
[3, 1, 2], None, [4, None, None, 5]], ), ( - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], ), ( - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index f461657281a..e85cd1cc443 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -8,7 +8,7 @@ import pylibcudf as plc -@pytest.fixture() +@pytest.fixture def str_data(): pa_data = pa.array(["A", None]) return pa_data, plc.interop.from_arrow(pa_data) diff --git a/python/pylibcudf/pylibcudf/traits.pyi b/python/pylibcudf/pylibcudf/traits.pyi new file mode 100644 index 00000000000..fdb31a262cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/traits.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.types import DataType + +def is_relationally_comparable(typ: DataType) -> bool: ... +def is_equality_comparable(typ: DataType) -> bool: ... +def is_numeric(typ: DataType) -> bool: ... +def is_numeric_not_bool(typ: DataType) -> bool: ... +def is_index_type(typ: DataType) -> bool: ... +def is_unsigned(typ: DataType) -> bool: ... +def is_integral(typ: DataType) -> bool: ... +def is_integral_not_bool(typ: DataType) -> bool: ... +def is_floating_point(typ: DataType) -> bool: ... +def is_boolean(typ: DataType) -> bool: ... +def is_timestamp(typ: DataType) -> bool: ... +def is_fixed_point(typ: DataType) -> bool: ... +def is_duration(typ: DataType) -> bool: ... +def is_chrono(typ: DataType) -> bool: ... +def is_dictionary(typ: DataType) -> bool: ... +def is_fixed_width(typ: DataType) -> bool: ... +def is_compound(typ: DataType) -> bool: ... +def is_nested(typ: DataType) -> bool: ... 
+def is_bit_castable(source: DataType, target: DataType) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/traits.pyx b/python/pylibcudf/pylibcudf/traits.pyx index 9c52e0ac1ab..3cf0a3a4b3b 100644 --- a/python/pylibcudf/pylibcudf/traits.pyx +++ b/python/pylibcudf/pylibcudf/traits.pyx @@ -5,6 +5,27 @@ from pylibcudf.libcudf.utilities cimport traits from .types cimport DataType +__all__ = [ + "is_bit_castable", + "is_boolean", + "is_chrono", + "is_compound", + "is_dictionary", + "is_duration", + "is_equality_comparable", + "is_fixed_point", + "is_fixed_width", + "is_floating_point", + "is_index_type", + "is_integral", + "is_integral_not_bool", + "is_nested", + "is_numeric", + "is_numeric_not_bool", + "is_relationally_comparable", + "is_timestamp", + "is_unsigned", +] cpdef bool is_relationally_comparable(DataType typ): """Checks if the given data type supports relational comparisons. diff --git a/python/pylibcudf/pylibcudf/transform.pyi b/python/pylibcudf/pylibcudf/transform.pyi new file mode 100644 index 00000000000..5cbd2e635f0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/transform.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column import Column +from pylibcudf.expressions import Expression +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.table import Table +from pylibcudf.types import DataType + +def nans_to_nulls(input: Column) -> tuple[gpumemoryview, int]: ... +def compute_column(input: Table, expr: Expression) -> Column: ... +def bools_to_mask(input: Column) -> tuple[gpumemoryview, int]: ... +def mask_to_bools(bitmask: int, begin_bit: int, end_bit: int) -> Column: ... +def transform( + input: Column, unary_udf: str, output_type: DataType, is_ptx: bool +) -> Column: ... +def encode(input: Table) -> tuple[Table, Column]: ... +def one_hot_encode(input: Column, categories: Column) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index e8d95cadb0c..9700bcff221 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -18,6 +18,15 @@ from .gpumemoryview cimport gpumemoryview from .types cimport DataType from .utils cimport int_to_bitmask_ptr +__all__ = [ + "bools_to_mask", + "compute_column", + "encode", + "mask_to_bools", + "nans_to_nulls", + "one_hot_encode", + "transform", +] cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): """Create a null mask preserving existing nulls and converting nans to null. diff --git a/python/pylibcudf/pylibcudf/transpose.pyi b/python/pylibcudf/pylibcudf/transpose.pyi new file mode 100644 index 00000000000..a84ab8a60ea --- /dev/null +++ b/python/pylibcudf/pylibcudf/transpose.pyi @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.table import Table + +def transpose(input_table: Table) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx index a24f937ced3..5eb3e58cebc 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyx +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from .column cimport Column from .table cimport Table +__all__ = ["transpose"] cpdef Table transpose(Table input_table): """Transpose a Table. diff --git a/python/pylibcudf/pylibcudf/types.pyi b/python/pylibcudf/pylibcudf/types.pyi new file mode 100644 index 00000000000..c91a95414bd --- /dev/null +++ b/python/pylibcudf/pylibcudf/types.pyi @@ -0,0 +1,86 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum +from typing import Final + +class Interpolation(IntEnum): + LINEAR = ... + LOWER = ... + HIGHER = ... + MIDPOINT = ... + NEAREST = ... + +class MaskState(IntEnum): + UNALLOCATED = ... + UNINITIALIZED = ... + ALL_VALID = ... + ALL_NULL = ... 
+ +class NanEquality(IntEnum): + ALL_EQUAL = ... + UNEQUAL = ... + +class NanPolicy(IntEnum): + NAN_IS_NULL = ... + NAN_IS_VALID = ... + +class NullEquality(IntEnum): + EQUAL = ... + UNEQUAL = ... + +class NullOrder(IntEnum): + AFTER = ... + BEFORE = ... + +class NullPolicy(IntEnum): + EXCLUDE = ... + INCLUDE = ... + +class Order(IntEnum): + ASCENDING = ... + DESCENDING = ... + +class Sorted(IntEnum): + NO = ... + YES = ... + +class TypeId(IntEnum): + EMPTY = ... + INT8 = ... + INT16 = ... + INT32 = ... + INT64 = ... + UINT8 = ... + UINT16 = ... + UINT32 = ... + UINT64 = ... + FLOAT32 = ... + FLOAT64 = ... + BOOL8 = ... + TIMESTAMP_DAYS = ... + TIMESTAMP_SECONDS = ... + TIMESTAMP_MILLISECONDS = ... + TIMESTAMP_MICROSECONDS = ... + TIMESTAMP_NANOSECONDS = ... + DURATION_DAYS = ... + DURATION_SECONDS = ... + DURATION_MILLISECONDS = ... + DURATION_MICROSECONDS = ... + DURATION_NANOSECONDS = ... + DICTIONARY32 = ... + STRING = ... + LIST = ... + DECIMAL32 = ... + DECIMAL64 = ... + DECIMAL128 = ... + STRUCT = ... + NUM_TYPE_IDS = ... + +class DataType: + def __init__(self, type_id: TypeId, scale: int = 0): ... + def id(self) -> TypeId: ... + def scale(self) -> int: ... + +def size_of(t: DataType) -> int: ... 
+ +SIZE_TYPE: Final[DataType] +SIZE_TYPE_ID: Final[TypeId] diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx index a0c31f994a3..afa1b56f38a 100644 --- a/python/pylibcudf/pylibcudf/types.pyx +++ b/python/pylibcudf/pylibcudf/types.pyx @@ -20,6 +20,22 @@ from pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, i from pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip from pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip +__all__ = [ + "DataType", + "Interpolation", + "MaskState", + "NanEquality", + "NanPolicy", + "NullEquality", + "NullOrder", + "NullPolicy", + "Order", + "SIZE_TYPE", + "SIZE_TYPE_ID", + "Sorted", + "TypeId", + "size_of" +] cdef class DataType: """Indicator for the logical data type of an element in a column. diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi new file mode 100644 index 00000000000..7aa23b618f4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/unary.pyi @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +class UnaryOperator(IntEnum): + SIN = ... + COS = ... + TAN = ... + ARCSIN = ... + ARCCOS = ... + ARCTAN = ... + SINH = ... + COSH = ... + TANH = ... + ARCSINH = ... + ARCCOSH = ... + ARCTANH = ... + EXP = ... + LOG = ... + SQRT = ... + CBRT = ... + CEIL = ... + FLOOR = ... + ABS = ... + RINT = ... + BIT_INVERT = ... + NOT = ... + +def unary_operation(input: Column, op: UnaryOperator) -> Column: ... +def is_null(input: Column) -> Column: ... +def is_valid(input: Column) -> Column: ... +def cast(input: Column, data_type: DataType) -> Column: ... +def is_nan(input: Column) -> Column: ... +def is_not_nan(input: Column) -> Column: ... +def is_supported_cast(from_: DataType, to: DataType) -> bool: ... 
diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx index 53e8c382b5e..b738ab53d1b 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -13,6 +13,16 @@ from pylibcudf.libcudf.unary import \ from .column cimport Column from .types cimport DataType +__all__ = [ + "UnaryOperator", + "cast", + "is_nan", + "is_not_nan", + "is_null", + "is_supported_cast", + "is_valid", + "unary_operation", +] cpdef Column unary_operation(Column input, unary_operator op): """Perform a unary operation on a column. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index ac3018b9333..83ed95823da 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -56,13 +56,30 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/" [tool.ruff] extend = "../../pyproject.toml" +[tool.ruff.lint] +extend-select = [ + "TCH", # flake8-type-checking + "TID", # flake8-tidy-imports + "PT", # flake8-pytest-style +] +extend-ignore = [ + "PT011", # pytest.raises(...) 
is too broad +] + +[tool.ruff.lint.flake8-pytest-style] +# https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style +fixture-parentheses = false +mark-parentheses = false +parametrize-names-type = "csv" +parametrize-values-type = "list" +parametrize-values-row-type = "tuple" + [tool.ruff.lint.isort] combine-as-imports = true -known-first-party = ["cudf"] -section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] +known-first-party = ["pylibcudf"] +section-order = ["future", "standard-library", "third-party", "rapids", "first-party", "local-folder"] [tool.ruff.lint.isort.sections] -dask = ["dask", "distributed", "dask_cuda"] rapids = ["rmm"] [tool.ruff.lint.per-file-ignores] From 796de4bd5131c38428b609c543323193f298624e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 12 Nov 2024 11:59:04 -0500 Subject: [PATCH 37/40] Add cudf::strings::contains_multiple (#16900) Add new `cudf::strings::contains_multiple` API to search multiple targets within a strings column. Output is a table where the number of columns is the number of targets and each row is a boolean indicating that target was found at the row or not. 
This PR is to help in collaboration with #16641 Authors: - David Wendt (https://github.com/davidwendt) - GALI PREM SAGAR (https://github.com/galipremsagar) - Chong Gao (https://github.com/res-life) - Bradley Dice (https://github.com/bdice) Approvers: - Chong Gao (https://github.com/res-life) - Yunsong Wang (https://github.com/PointKernel) - MithunR (https://github.com/mythrocks) - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16900 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/string/find.cpp | 14 +- cpp/benchmarks/string/find_multiple.cpp | 77 +++++ cpp/include/cudf/strings/find_multiple.hpp | 40 ++- cpp/src/strings/search/contains_multiple.cu | 316 ++++++++++++++++++++ cpp/src/strings/search/find_multiple.cu | 5 +- cpp/tests/strings/find_multiple_tests.cpp | 155 +++++++++- cpp/tests/strings/find_tests.cpp | 4 +- 9 files changed, 592 insertions(+), 21 deletions(-) create mode 100644 cpp/benchmarks/string/find_multiple.cpp create mode 100644 cpp/src/strings/search/contains_multiple.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 65b05fd518b..e237b0b2856 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -705,6 +705,7 @@ add_library( src/strings/replace/replace_slice.cu src/strings/reverse.cu src/strings/scan/scan_inclusive.cu + src/strings/search/contains_multiple.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 59f5602fd5a..419b78db9b0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -375,6 +375,7 @@ ConfigureNVBench( string/count.cpp string/extract.cpp string/find.cpp + string/find_multiple.cpp string/join_strings.cpp string/lengths.cpp string/like.cpp diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 996bdcf0332..3ea3ff13a2f 
100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -20,9 +20,7 @@ #include #include -#include #include -#include #include #include @@ -44,15 +42,13 @@ static void bench_find_string(nvbench::state& state) auto const col = create_string_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - std::vector h_targets({"5W", "5W43", "0987 5W43"}); - cudf::string_scalar target(h_targets[2]); - cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + cudf::string_scalar target("0987 5W43"); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); - if (api.substr(0, 4) == "find") { + if (api == "find") { state.add_global_memory_writes(input.size()); } else { state.add_global_memory_writes(input.size()); @@ -61,10 +57,6 @@ static void bench_find_string(nvbench::state& state) if (api == "find") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); - } else if (api == "find_multi") { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); - }); } else if (api == "contains") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); @@ -79,7 +71,7 @@ static void bench_find_string(nvbench::state& state) NVBENCH_BENCH(bench_find_string) .set_name("find_string") - .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"}) + .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git 
a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp new file mode 100644 index 00000000000..0e780fdb302 --- /dev/null +++ b/cpp/benchmarks/string/find_multiple.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include +#include +#include +#include + +#include + +static void bench_find_string(nvbench::state& state) +{ + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + auto const target_count = static_cast(state.get_int64("targets")); + auto const api = state.get_string("api"); + + auto const stream = cudf::get_default_stream(); + auto const col = create_string_column(n_rows, row_width, hit_rate); + auto const input = cudf::strings_column_view(col->view()); + + // Note that these all match the first row of the raw_data in create_string_column. + // This is so the hit_rate can be properly accounted for.
+ std::vector const target_data( + {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"}); + auto h_targets = std::vector{}; + for (cudf::size_type i = 0; i < target_count; i++) { + h_targets.emplace_back(target_data[i % target_data.size()]); + } + cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); + if (api == "find") { + state.add_global_memory_writes(input.size()); + } else { + state.add_global_memory_writes(input.size()); + } + + if (api == "find") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); + }); + } else if (api == "contains") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::contains_multiple(input, cudf::strings_column_view(targets)); + }); + } +} + +NVBENCH_BENCH(bench_find_string) + .set_name("find_multiple") + .add_string_axis("api", {"find", "contains"}) + .add_int64_axis("targets", {10, 20, 40}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 1fe446db8da..e090766dd07 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -28,8 +28,42 @@ namespace strings { */ /** - * @brief Returns a lists column with character position values where each - * of the target strings are found in each string. + * @brief Searches for the given target strings within each string in the provided column + * + * Each column in the result table corresponds to the result for the target string at the same + * ordinal. i.e. 
0th column is the BOOL8 column result for the 0th target string, 1st for 1st, + * etc. + * + * If the target is not found for a string, false is returned for that entry in the output column. + * If the target is an empty string, true is returned for all non-null entries in the output column. + * + * Any null input strings return corresponding null entries in the output columns. + * + * @code{.pseudo} + * input = ["a", "b", "c"] + * targets = ["a", "c"] + * output is a table with two boolean columns: + * column 0: [true, false, false] + * column 1: [false, false, true] + * @endcode + * + * @throw std::invalid_argument if `targets` is empty or contains nulls + * + * @param input Strings instance for this operation + * @param targets UTF-8 encoded strings to search for in each string in `input` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return Table of BOOL8 columns + */ +std::unique_ptr
contains_multiple( + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Searches for the given target strings within each string in the provided column + * and returns the position the targets were found * * The size of the output column is `input.size()`. * Each row of the output column is of size `targets.size()`. @@ -45,7 +79,7 @@ namespace strings { * [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1 * @endcode * - * @throw cudf::logic_error if `targets` is empty or contains nulls + * @throw std::invalid_argument if `targets` is empty or contains nulls * * @param input Strings instance for this operation * @param targets Strings to search for in each string diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu new file mode 100644 index 00000000000..1183e3e4038 --- /dev/null +++ b/cpp/src/strings/search/contains_multiple.cu @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * a warp-parallel function is used. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + +/** + * @brief Kernel for finding multiple targets in each row of input strings + * + * The d_first_bytes is sorted and unique so the d_indices and d_offsets + * are used to map the corresponding character to its d_targets entry. + * + * Example + * d_targets = ["foo", "hello", "world", "hi"] + * - sorted first-chars: ['f','h','h','w'] + * d_indices = [0, 3, 1, 2] + * d_first_bytes = ['f', 'h', 'w'] (unique) + * d_offsets = [0, 1, 3] + * unique_count = 3 + * + * If 'h' is found, lower_bound produces pos=1 in d_first_bytes. + * This corresponds to d_offsets[1]==1 which has two values: + * - (d_offsets[2] - d_offsets[1]) = (3 - 1) = 2. + * Set map_idx = d_offsets[1] = 1 and the two targets to check are sequential + * in the d_indices array: + * - tgt1_idx = d_indices[map_idx] = 3 --> d_targets[3] == 'hi' + * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello' + * The logic now only needs to check for either of these 2 targets. + * + * This kernel works in either thread-per-string or warp-per-string depending + * on the template parameter. If tile_size==1, then this kernel executes as + * a thread-per-string. If tile_size==32, then it executes as a warp-per-string. + * No other options are supported for now.
+ * + * @tparam tile_size Number of threads per string + * @param d_strings Input strings + * @param d_targets Target strings to search within input strings + * @param d_first_bytes Sorted, unique list of first bytes of the target strings + * @param d_indices Indices to map sorted d_first_bytes to d_targets + * @param d_offsets Offsets to map d_indices to d_targets + * @param unique_count Number of unique values in d_first_bytes (and d_offsets) + * @param working_memory Global memory to use if shared-memory is too small + * @param d_results Bool results for each target within each string row + */ +template +CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, + column_device_view const d_targets, + u_char const* d_first_bytes, + size_type const* d_indices, + size_type const* d_offsets, + size_type unique_count, + bool* working_memory, + cudf::device_span d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + // get the string for this tile + auto const d_str = d_strings.element(str_idx); + + namespace cg = cooperative_groups; + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const lane_idx = tile.thread_rank(); + auto const num_targets = d_targets.size(); + + // size of shared_bools = num_targets * block_size + // each thread uses num_targets bools + extern __shared__ bool shared_bools[]; + // bools for the current string + auto bools = working_memory == nullptr + ? 
(shared_bools + (tile.meta_group_rank() * tile_size * num_targets)) + : (working_memory + (str_idx * tile_size * num_targets)); + + // initialize result: set true if target is empty, false otherwise + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const d_target = d_targets.element(target_idx); + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = d_target.empty(); + } else { + auto const begin = bools + (target_idx * tile_size); + thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty()); + } + } + tile.sync(); + + auto const last_ptr = d_first_bytes + unique_count; + for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes(); + str_byte_idx += tile_size) { + // search for byte in first_bytes array + auto const sptr = d_str.data() + str_byte_idx; + auto const chr = static_cast(*sptr); + auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr); + // if not found, continue to next byte + if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } + // compute index of matched byte + auto const offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; + auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets; + // check for targets that begin with chr + while (map_idx < last_idx) { + auto const target_idx = d_indices[map_idx++]; + auto const bool_idx = (target_idx * tile_size) + lane_idx; + auto const found = tile_size == 1 ? 
d_results[target_idx][str_idx] : bools[bool_idx]; + if (!found) { // not found before + auto const d_target = d_targets.element(target_idx); + if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { + // first char already checked, so just check the [1, end) chars match + auto const tp = d_target.data(); + if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) { + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = true; + } else { + bools[bool_idx] = true; + } + } + } + } + } + } + + if constexpr (tile_size > 1) { + tile.sync(); + // reduce the bools for each target to store in the result + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const begin = bools + (target_idx * tile_size); + d_results[target_idx][str_idx] = + thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); + // cooperative_group any() implementation was almost 3x slower than this parallel reduce + } + } +} +} // namespace + +std::unique_ptr
contains_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS( + not targets.is_empty(), "Must specify at least one target string.", std::invalid_argument); + CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null", std::invalid_argument); + + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_targets = column_device_view::create(targets.parent(), stream); + + // copy the first byte of each target and sort them + auto first_bytes = rmm::device_uvector(targets.size(), stream); + auto indices = rmm::device_uvector(targets.size(), stream); + { + auto tgt_itr = thrust::make_transform_iterator( + d_targets->begin(), + cuda::proclaim_return_type([] __device__(auto const& d_tgt) -> u_char { + return d_tgt.empty() ? u_char{0} : static_cast(d_tgt.data()[0]); + })); + auto count_itr = thrust::make_counting_iterator(0); + auto keys_out = first_bytes.begin(); + auto vals_out = indices.begin(); + auto num_items = targets.size(); + auto cmp_op = thrust::less(); + auto sv = stream.value(); + + std::size_t tmp_bytes = 0; + cub::DeviceMergeSort::SortPairsCopy( + nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + cub::DeviceMergeSort::SortPairsCopy( + tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + } + + // remove duplicates to help speed up lower_bound + auto offsets = rmm::device_uvector(targets.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end()); + auto const end = thrust::unique_by_key( + rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin()); + auto const unique_count = + static_cast(thrust::distance(first_bytes.begin(), end.first)); + + // create output columns + auto const results_iter = 
cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + }); + auto results = std::vector>(results_iter, results_iter + targets.size()); + auto d_results = [&] { + auto host_results_pointer_iter = + thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + }(); + + constexpr cudf::thread_index_type block_size = 256; + // calculated (benchmarked) for efficient use of shared-memory + constexpr size_type targets_threshold = 32; + + auto d_first_bytes = first_bytes.data(); + auto d_indices = indices.data(); + auto d_offsets = offsets.data(); + + bool const row_parallel = ((input.null_count() == input.size()) || + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)); + + if (row_parallel) { + // Smaller strings perform better with a row per string + cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; + multi_contains_kernel<1> + <<>>(*d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + nullptr, + d_results); + } else { + constexpr cudf::thread_index_type tile_size = cudf::detail::warp_size; + + auto const shared_mem_size = + (targets.size() <= targets_threshold) ? (block_size * targets.size()) : 0; + auto const work_mem_size = + (targets.size() <= targets_threshold) ? 
0 : tile_size * targets.size() * input.size(); + auto working_memory = rmm::device_uvector(work_mem_size, stream); + + cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + block_size}; + multi_contains_kernel + <<>>( + *d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + working_memory.data(), + d_results); + } + + return std::make_unique
(std::move(results)); +} + +} // namespace detail + +std::unique_ptr
contains_multiple(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_multiple(strings, targets, stream, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index ec7015878dd..67226b259d4 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -42,8 +42,9 @@ std::unique_ptr find_multiple(strings_column_view const& input, { auto const strings_count = input.size(); auto const targets_count = targets.size(); - CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); - CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); + CUDF_EXPECTS(targets_count > 0, "Must include at least one search target", std::invalid_argument); + CUDF_EXPECTS( + !targets.has_nulls(), "Search targets cannot contain null strings", std::invalid_argument); auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index 41a5940c880..3c8483b153d 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -75,8 +76,158 @@ TEST_F(StringsFindMultipleTest, ErrorTest) auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto empty_view = cudf::strings_column_view(zero_size_strings_column); // targets must have at least one string - EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, 
empty_view), std::invalid_argument); // targets cannot have nulls - EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, strings_view), std::invalid_argument); +} + +TEST_F(StringsFindMultipleTest, MultiContains) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 9 rows: + std::vector s = { + "Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", + "algorithms execution is parallelized as determined by an execution policy. t", + "he this is a continuation of previous row to make sure string boundaries are honored", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", + "", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ...... 
+ auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + auto strings_view = cudf::strings_column_view(strings); + std::vector match_targets({" the ", "a", "", "é"}); + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = + cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column)); + + std::vector ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0}; + std::vector ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + + auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindMultipleTest, MultiContainsMoreTargets) +{ + auto const strings = cudf::test::strings_column_wrapper{ + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position"}; + auto strings_view = cudf::strings_column_view(strings); + std::vector targets({"lazy brown", "non-exist", 
""}); + + std::vector> expects; + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({0, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 1, 1})); + + std::vector match_targets; + int max_num_targets = 50; + + for (int num_targets = 1; num_targets < max_num_targets; num_targets++) { + match_targets.clear(); + for (int i = 0; i < num_targets; i++) { + match_targets.push_back(targets[i % targets.size()]); + } + + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = cudf::strings::contains_multiple( + strings_view, cudf::strings_column_view(multi_targets_column)); + EXPECT_EQ(results->num_columns(), num_targets); + for (int i = 0; i < num_targets; i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]); + } + } +} + +TEST_F(StringsFindMultipleTest, MultiContainsLongStrings) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 7 rows: + std::vector s = { + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position", + "algorithms execution is parallelized as determined by an execution policy. 
t algorithms " + "execution is parallelized as ", + "he this is a continuation of previous row to make sure string boundaries are honored he this " + "is a continuation of previous row", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ " + "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + + auto sv = cudf::strings_column_view(strings); + auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "}); + auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets)); + + std::vector ret_0 = {1, 0, 1, 0, 0, 0, 0}; + std::vector ret_1 = {0, 1, 0, 0, 0, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {0, 0, 0, 0, 1, 0, 0}; + std::vector ret_4 = {1, 0, 0, 0, 0, 0, 0}; + std::vector ret_5 = {0, 0, 1, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + auto expected_4 = make_bool_col_fn(ret_4); + auto expected_5 = make_bool_col_fn(ret_5); + + auto expected = + cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); } diff --git 
a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 2da95ba5c27..a3066c40650 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -17,16 +17,14 @@ #include #include #include +#include -#include #include #include #include #include #include -#include - #include struct StringsFindTest : public cudf::test::BaseFixture {}; From 1f9ad2f33867789d734c9be9bbacaabe1e348884 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 12 Nov 2024 16:20:29 -0600 Subject: [PATCH 38/40] enforce wheel size limits, README formatting in CI (#17284) Contributes to https://github.com/rapidsai/build-planning/issues/110 Proposes adding 2 types of validation on wheels in CI, to ensure we continue to produce wheels that are suitable for PyPI. * checks on wheel size (compressed), - *to be sure they're under PyPI limits* - *and to prompt discussion on PRs that significantly increase wheel sizes* * checks on README formatting - *to ensure they'll render properly as the PyPI project homepages* - *e.g. like how https://github.com/scikit-learn/scikit-learn/blob/main/README.rst becomes https://pypi.org/project/scikit-learn/* ## Notes for Reviewers ### How I tested this Initially set the size threshold for `libcudf` to a value that I knew it'd violate (75MB compressed, when the wheels are 400+ MB compressed). Saw CI fail as expected, and print a summary with the expected contents. 
```text checking 'final_dist/libcudf_cu11-24.12.0a333-py3-none-manylinux_2_28_aarch64.whl' ----- package inspection summary ----- file size * compressed size: 0.4G * uncompressed size: 0.6G * compression space saving: 34.6% contents * directories: 164 * files: 1974 (2 compiled) size by extension * .so - 0.6G (97.0%) * .h - 6.7M (1.0%) * no-extension - 4.8M (0.7%) * .cuh - 3.8M (0.6%) * .hpp - 2.2M (0.3%) * .a - 1.1M (0.2%) * .inl - 0.8M (0.1%) * .cmake - 0.1M (0.0%) * .md - 8.3K (0.0%) * .py - 4.0K (0.0%) * .pc - 0.2K (0.0%) * .txt - 34.0B (0.0%) largest files * (0.6G) libcudf/lib64/libcudf.so * (3.3M) libcudf/bin/flatc * (1.0M) libcudf/lib64/libflatbuffers.a * (0.5M) libcudf/include/libcudf/rapids/libcudacxx/cuda/std/__atomic/functions/cuda_ptx_generated.h * (0.2M) libcudf_cu11-24.12.0a333.dist-info/RECORD ------------ check results ----------- 1. [distro-too-large-compressed] Compressed size 0.4G is larger than the allowed size (75.0M). errors found while checking: 1 ``` ([build link](https://github.com/rapidsai/cudf/actions/runs/11748370606/job/32732391718?pr=17284#step:13:3062)) Updated that threshold in `python/libcudf/pyproject.toml`, and saw the build succeed (but the summary still printed). 
# Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17284 --- ci/build_wheel_cudf.sh | 2 ++ ci/build_wheel_cudf_polars.sh | 1 + ci/build_wheel_dask_cudf.sh | 1 + ci/build_wheel_libcudf.sh | 2 ++ ci/build_wheel_pylibcudf.sh | 2 ++ ci/validate_wheel.sh | 21 +++++++++++++++++++++ python/cudf/pyproject.toml | 8 ++++++++ python/cudf_kafka/pyproject.toml | 8 ++++++++ python/cudf_polars/pyproject.toml | 8 ++++++++ python/custreamz/pyproject.toml | 8 ++++++++ python/dask_cudf/pyproject.toml | 8 ++++++++ python/libcudf/pyproject.toml | 8 ++++++++ python/pylibcudf/pyproject.toml | 8 ++++++++ 13 files changed, 85 insertions(+) create mode 100755 ci/validate_wheel.sh diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index ae4eb0d5c66..32dd5a7fa62 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -27,4 +27,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh index 79853cdbdb2..38048125247 100755 --- a/ci/build_wheel_cudf_polars.sh +++ b/ci/build_wheel_cudf_polars.sh @@ -6,6 +6,7 @@ set -euo pipefail package_dir="python/cudf_polars" ./ci/build_wheel.sh cudf-polars ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index 00c64afa2ef..b0ae2f23abc 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -6,6 +6,7 @@ set -euo pipefail package_dir="python/dask_cudf" 
./ci/build_wheel.sh dask-cudf ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index aabd3814a24..af49942c8cd 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -37,4 +37,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index c4a89f20f5f..5a8f3397714 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -25,4 +25,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 00000000000..5910a5c59fe --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index ca6dbddfecc..280dd52bb22 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -83,6 +83,14 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index ec0bc0eb22b..b2ea3f06e48 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -47,6 +47,14 @@ rapids = ["rmm", "cudf", "dask_cudf"] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 2e75dff5c9e..32ea142a96c 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -49,6 +49,14 @@ license-files = ["LICENSE"] [tool.setuptools.dynamic] version = {file = "cudf_polars/VERSION"} +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 
MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index d3baf3bf4d2..dd67a019c77 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -65,6 +65,14 @@ include = [ ] exclude = ["*tests*"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.ruff] extend = "../../pyproject.toml" diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index c4bfc3054bc..07d9143db36 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -81,6 +81,14 @@ section-order = ["future", "standard-library", "third-party", "dask", "rapids", dask = ["dask", "distributed", "dask_cuda"] rapids = ["rmm", "cudf"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 62726bb0df4..8c650eb2144 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -48,6 +48,14 @@ Homepage = "https://github.com/rapidsai/cudf" [project.entry-points."cmake.prefix"] libcudf = "libcudf" +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 600 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '525M' + [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 
83ed95823da..e83db47830c 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -85,6 +85,14 @@ rapids = ["rmm"] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib" From bbaa1ab1eab41d26ca2b280b3b48a73ed3f411b9 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 12 Nov 2024 22:57:21 +0000 Subject: [PATCH 39/40] Support polars 1.13 (#17299) Polars 1.13 is out, so add support for that. I needed to change some of the logic in the callback raising after @Matt711's changes; I am not sure why tests were passing previously. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17299 --- ci/test_cudf_polars_polars_tests.sh | 23 +----- ci/test_wheel_cudf_polars.sh | 23 +----- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf_polars/cudf_polars/callback.py | 75 ++++++++----------- python/cudf_polars/cudf_polars/dsl/ir.py | 3 +- .../cudf_polars/cudf_polars/dsl/nodebase.py | 4 +- .../cudf_polars/cudf_polars/testing/plugin.py | 2 +- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/test_config.py | 2 +- 12 files changed, 44 insertions(+), 98 deletions(-) diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index f5bcdc62604..fefe26984cb 100755 --- a/ci/test_cudf_polars_polars_tests.sh 
+++ b/ci/test_cudf_polars_polars_tests.sh @@ -3,22 +3,6 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? -if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -63,9 +47,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Running polars test suite PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 2884757e46b..6c827406f78 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -3,22 +3,6 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? 
-if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -65,9 +49,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Testing PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 01764411346..e91443ddba8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.13 +- polars>=1.11,<1.14 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 9074e6332d9..2dccb595e59 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.13 +- polars>=1.11,<1.14 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index edf92b930d9..7a477291e7a 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.12 + - polars >=1.11,<1.14 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index 
e47e0c7523c..b5165f82d5f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -734,7 +734,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.13 + - polars>=1.11,<1.14 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index ff4933c7564..d085f21e0ad 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -148,12 +148,7 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf( - nt: NodeTraverser, - *, - config: GPUEngine, - exception: type[Exception] | tuple[type[Exception], ...] = Exception, -) -> None: +def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -165,10 +160,15 @@ def execute_with_cudf( config GPUEngine configuration object - exception - Optional exception, or tuple of exceptions, to catch during - translation. Defaults to ``Exception``. + Raises + ------ + ValueError + If the config contains unsupported keys. + NotImplementedError + If translation of the plan is unsupported. + Notes + ----- The NodeTraverser is mutated if the libcudf executor can handle the plan. """ device = config.device @@ -178,38 +178,27 @@ def execute_with_cudf( raise ValueError( f"Engine configuration contains unsupported settings {unsupported}" ) - try: - with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - translator = Translator(nt) - ir = translator.translate_ir() - ir_translation_errors = translator.errors - if len(ir_translation_errors): - # TODO: Display these errors in user-friendly way. 
- # tracked in https://github.com/rapidsai/cudf/issues/17051 - unique_errors = sorted(set(ir_translation_errors), key=str) - error_message = "Query contained unsupported operations" - verbose_error_message = ( - f"{error_message}\nThe errors were:\n{unique_errors}" - ) - unsupported_ops_exception = NotImplementedError( - error_message, unique_errors - ) - if bool(int(os.environ.get("POLARS_VERBOSE", 0))): - warnings.warn(verbose_error_message, UserWarning, stacklevel=2) - if raise_on_fail: - raise unsupported_ops_exception - else: - nt.set_udf( - partial( - _callback, ir, device=device, memory_resource=memory_resource - ) - ) - except exception as e: - if bool(int(os.environ.get("POLARS_VERBOSE", 0))): - warnings.warn( - f"Query execution with GPU not supported, reason: {type(e)}: {e}", - PerformanceWarning, - stacklevel=2, + with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): + translator = Translator(nt) + ir = translator.translate_ir() + ir_translation_errors = translator.errors + if len(ir_translation_errors): + # TODO: Display these errors in user-friendly way. + # tracked in https://github.com/rapidsai/cudf/issues/17051 + unique_errors = sorted(set(ir_translation_errors), key=str) + formatted_errors = "\n".join( + f"- {e.__class__.__name__}: {e}" for e in unique_errors + ) + error_message = ( + "Query execution with GPU not possible: unsupported operations." 
+ f"\nThe errors were:\n{formatted_errors}" + ) + exception = NotImplementedError(error_message, unique_errors) + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn(error_message, PerformanceWarning, stacklevel=2) + if raise_on_fail: + raise exception + else: + nt.set_udf( + partial(_callback, ir, device=device, memory_resource=memory_resource) ) - if raise_on_fail: - raise diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1f935190f28..98e8a83b04e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -227,6 +227,7 @@ class ErrorNode(IR): def __init__(self, schema: Schema, error: str): self.schema = schema self.error = error + self.children = () class PythonScan(IR): @@ -546,7 +547,7 @@ def do_evaluate( # shifts the row index. # But prior to 1.13, polars had this wrong, so we match behaviour # https://github.com/pola-rs/polars/issues/19607 - offset += skip_rows # pragma: no cover; polars 1.13 not yet released + offset += skip_rows dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py index 228d300f467..dd5c40a00be 100644 --- a/python/cudf_polars/cudf_polars/dsl/nodebase.py +++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py @@ -43,9 +43,7 @@ class Node(Generic[T]): def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]: return (*(getattr(self, attr) for attr in self._non_child), *children) - def reconstruct( - self, children: Sequence[T] - ) -> Self: # pragma: no cover; not yet used + def reconstruct(self, children: Sequence[T]) -> Self: """ Rebuild this node with new children. 
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 2f95cd38c57..080a1af6e19 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -40,7 +40,7 @@ def pytest_configure(config: pytest.Config) -> None: ) config.addinivalue_line( "filterwarnings", - "ignore:.*Query execution with GPU not supported", + "ignore:.*Query execution with GPU not possible", ) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 32ea142a96c..785e87391e7 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.13", + "polars>=1.11,<1.14", "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 9900f598e5f..25b71716eed 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -30,7 +30,7 @@ def raise_unimplemented(self, *args): pytest.raises(pl.exceptions.ComputeError), pytest.warns( pl.exceptions.PerformanceWarning, - match="Query execution with GPU not supported", + match="Query execution with GPU not possible", ), ): # And ensure that collecting issues the correct warning. From 487f97c036ae7919e98ddc8bf5412a8002a493c5 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 12 Nov 2024 15:20:58 -0800 Subject: [PATCH 40/40] Always prefer `device_read`s and `device_write`s when kvikIO is enabled (#17260) Issue #17259 Avoid checking `_gds_read_preferred_threshold` threshold when deciding whether `device_read`/`device_write` is preferred to host IO + copy. The reasons are twofold: 1. 
KvikIO already has an internal threshold for GDS use so we don't need to check on our end as well. 2. Without actual GDS use, kvikIO uses a pinned bounce buffer to efficiently copy to/from the device. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/17260 --- cpp/src/io/utilities/data_sink.cpp | 8 ++++++-- cpp/src/io/utilities/datasource.cpp | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 15de5d85614..68377ad6d5f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -72,8 +72,12 @@ class file_sink : public data_sink { [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - if (size < _gds_write_preferred_threshold) { return false; } - return supports_device_write(); + if (!supports_device_write()) { return false; } + + // Always prefer device writes if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_write_preferred_threshold; } std::future device_write_async(void const* gpu_data, diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 5ccc91e4220..0870e4a84a7 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -95,8 +95,12 @@ class file_source : public datasource { [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - if (size < _gds_read_preferred_threshold) { return false; } - return supports_device_read(); + if (!supports_device_read()) { return false; } + + // Always prefer device reads if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_read_preferred_threshold; } std::future device_read_async(size_t offset,