diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 9a0b4155035..d2ca78924e1 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -5,4 +5,3 @@ auto_merger: true branch_checker: true label_checker: true release_drafter: true -recently_updated: true diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml deleted file mode 100644 index b301c56a999..00000000000 --- a/.github/workflows/add_to_project.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: Add new issue/PR to project - -on: - issues: - types: - - opened - - pull_request_target: - types: - - opened - -jobs: - add-to-project: - name: Add issue or PR to project - runs-on: ubuntu-latest - steps: - - uses: actions/add-to-project@v0.3.0 - with: - project-url: https://github.com/orgs/rapidsai/projects/51 - github-token: ${{ secrets.ADD_TO_PROJECT_GITHUB_TOKEN }} diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 91ec0904103..0e120d34bb1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/new-issues-to-triage-projects.yml b/.github/workflows/new-issues-to-triage-projects.yml deleted file mode 100644 index cf9b0c379f1..00000000000 --- a/.github/workflows/new-issues-to-triage-projects.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Auto Assign New Issues to Triage Project - -on: - issues: - types: [opened] - -env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - -jobs: - assign_one_project: - runs-on: ubuntu-latest - name: Assign to New Issues to Triage Project - steps: - - name: Process bug issues - uses: docker://takanabe/github-actions-automate-projects:v0.0.1 - if: contains(github.event.issue.labels.*.name, 'bug') && contains(github.event.issue.labels.*.name, '? - Needs Triage') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/1 - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' - - name: Process feature issues - uses: docker://takanabe/github-actions-automate-projects:v0.0.1 - if: contains(github.event.issue.labels.*.name, 'feature request') && contains(github.event.issue.labels.*.name, '? - Needs Triage') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/9 - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' - - name: Process other issues - uses: docker://takanabe/github-actions-automate-projects:v0.0.1 - if: contains(github.event.issue.labels.*.name, '? 
- Needs Triage') && (!contains(github.event.issue.labels.*.name, 'bug') && !contains(github.event.issue.labels.*.name, 'feature request')) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/10 - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b47a40b13d2..054ea7968c8 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -73,7 +73,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: needs: conda-python-build @@ -83,7 +83,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -93,7 +93,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-cudf: needs: checks diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 6bd2787d6dc..030f2e41db4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit @@ -63,7 +63,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -75,7 +75,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 692ba78f317..9fb991f9075 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -9,6 +9,7 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 +- aws-sdk-cpp<1.11 - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 @@ -55,8 +56,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 -- numpy>=1.21 +- numba>=0.57,<0.58 +- numpy>=1.21,<1.25 - numpydoc - nvcc_linux-64=11.8 - nvcomp==2.6.1 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index cf1bf4b8733..9ba0dd8dc38 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -9,6 +9,7 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 +- aws-sdk-cpp<1.11 - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 @@ -54,8 +55,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 -- numpy>=1.21 +- numba>=0.57,<0.58 +- numpy>=1.21,<1.25 - numpydoc - nvcomp==2.6.1 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index a909b72c878..54b687faa69 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,8 +78,10 @@ requirements: - 
typing_extensions >=4.0.0 - pandas >=1.3,<1.6.0dev0 - cupy >=12.0.0 - - numba >=0.57 - - numpy >=1.21 + # TODO: Pin to numba<0.58 until #14160 is resolved + - numba >=0.57,<0.58 + # TODO: Pin to numpy<1.25 until cudf requires pandas 2 + - numpy >=1.21,<1.25 - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 25b3f19de77..b1f5b083e06 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -22,6 +22,9 @@ gbench_version: gtest_version: - ">=1.13.0" +aws_sdk_cpp_version: + - "<1.11" + libarrow_version: - "=12" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 627065817ba..28357f0d96d 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -74,6 +74,7 @@ requirements: - gtest {{ gtest_version }} - gmock {{ gtest_version }} - zlib {{ zlib_version }} + - aws-sdk-cpp {{ aws_sdk_cpp_version }} outputs: - name: libcudf @@ -107,6 +108,7 @@ outputs: - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} + - aws-sdk-cpp {{ aws_sdk_cpp_version }} test: commands: - test -f $PREFIX/lib/libcudf.so diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 900e9eed98e..a84f7bd5224 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -413,11 +413,13 @@ add_library( src/io/utilities/arrow_io_source.cpp src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp + src/io/utilities/data_casting.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu src/io/utilities/row_selection.cpp + src/io/utilities/type_inference.cu src/io/utilities/trie.cu src/jit/cache.cpp src/jit/parser.cpp diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 6b8af91b842..b1aaef41340 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -141,17 +142,18 @@ std::vector select_column_names(std::vector const& col return col_names_to_read; } -std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk) +std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk_idx) { CUDF_EXPECTS(num_segments >= num_chunks, "Number of chunks cannot be greater than the number of segments in the file"); - auto start_segment = [num_segments, num_chunks](int chunk) { - return num_segments * chunk / num_chunks; - }; - std::vector selected_segments; - for (auto segment = start_segment(chunk); segment < start_segment(chunk + 1); ++segment) { - selected_segments.push_back(segment); - } + CUDF_EXPECTS(chunk_idx < num_chunks, + "Chunk index must be smaller than the number of chunks in the file"); + + auto const segments_in_chunk = cudf::util::div_rounding_up_unsafe(num_segments, num_chunks); + auto const begin_segment = std::min(chunk_idx * segments_in_chunk, num_segments); + auto const end_segment = std::min(begin_segment + segments_in_chunk, num_segments); + std::vector selected_segments(end_segment - begin_segment); + std::iota(selected_segments.begin(), selected_segments.end(), begin_segment); return selected_segments; } diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp index 647a411c89d..1f656f7ea70 100644 --- 
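
For reference, the reworked segments_in_chunk logic above can be reproduced host-side without any cudf dependency; a minimal sketch, with plain integer ceil-division standing in for cudf::util::div_rounding_up_unsafe:

    #include <algorithm>
    #include <cassert>
    #include <numeric>
    #include <vector>

    // Host-only sketch of the reworked chunk partitioning: each chunk gets
    // ceil(num_segments / num_chunks) consecutive segment indices, clamped to
    // the valid range, instead of the old proportional split.
    std::vector<int> segments_in_chunk_sketch(int num_segments, int num_chunks, int chunk_idx)
    {
      assert(num_segments >= num_chunks);  // mirrors the existing CUDF_EXPECTS
      assert(chunk_idx < num_chunks);      // mirrors the newly added CUDF_EXPECTS
      int const per_chunk = (num_segments + num_chunks - 1) / num_chunks;  // div rounding up
      int const begin     = std::min(chunk_idx * per_chunk, num_segments);
      int const end       = std::min(begin + per_chunk, num_segments);
      std::vector<int> selected(end - begin);
      std::iota(selected.begin(), selected.end(), begin);
      return selected;
    }
    // e.g. segments_in_chunk_sketch(10, 4, 0..3) -> {0,1,2}, {3,4,5}, {6,7,8}, {9}

Every chunk except possibly the last receives the same number of segments, so the "overflow row group" workaround for the last chunk is no longer needed.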
a/cpp/benchmarks/io/orc/orc_reader_options.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -30,7 +31,7 @@ constexpr int64_t data_size = 512 << 20; // The number of separate read calls to use when reading files in multiple chunks // Each call reads roughly equal amounts of data -constexpr int32_t chunked_read_num_chunks = 8; +constexpr int32_t chunked_read_num_chunks = 4; std::vector get_top_level_col_names(cudf::io::source_info const& source) { @@ -88,7 +89,7 @@ void BM_orc_read_varying_options(nvbench::state& state, auto const num_stripes = cudf::io::read_orc_metadata(source_sink.make_source_info()).num_stripes(); - cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; + auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -99,7 +100,6 @@ void BM_orc_read_varying_options(nvbench::state& state, timer.start(); cudf::size_type rows_read = 0; for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - auto const is_last_chunk = chunk == (num_chunks - 1); switch (RowSelection) { case row_selection::ALL: break; case row_selection::STRIPES: @@ -108,7 +108,6 @@ void BM_orc_read_varying_options(nvbench::state& state, case row_selection::NROWS: read_options.set_skip_rows(chunk * chunk_row_cnt); read_options.set_num_rows(chunk_row_cnt); - if (is_last_chunk) read_options.set_num_rows(-1); break; default: CUDF_FAIL("Unsupported row selection method"); } @@ -132,9 +131,6 @@ using col_selections = nvbench::enum_type_list; -using row_selections = - nvbench::enum_type_list; - NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(col_selections, nvbench::enum_type_list, @@ -146,6 +142,8 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"}) .set_min_samples(4); +using row_selections = + nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(nvbench::enum_type_list, row_selections, diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 4105f2182d7..9f221de7da2 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,21 +27,21 @@ // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput -constexpr std::size_t data_size = 512 << 20; -constexpr std::size_t row_group_size = 128 << 20; +constexpr std::size_t data_size = 512 << 20; +// The number of separate read calls to use when reading files in multiple chunks +// Each call reads roughly equal amounts of data +constexpr int32_t chunked_read_num_chunks = 4; std::vector get_top_level_col_names(cudf::io::source_info const& source) { - cudf::io::parquet_reader_options const read_options = - cudf::io::parquet_reader_options::builder(source); - auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info; - - std::vector names; - names.reserve(schema.size()); - std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) { - return c.name; - }); - return names; + auto const top_lvl_cols = 
cudf::io::read_parquet_metadata(source).schema().root().children(); + std::vector col_names; + std::transform(top_lvl_cols.cbegin(), + top_lvl_cols.cend(), + std::back_inserter(col_names), + [](auto const& col_meta) { return col_meta.name(); }); + + return col_names; } template , nvbench::enum_type>) { + auto const num_chunks = RowSelection == row_selection::ALL ? 1 : chunked_read_num_chunks; + auto constexpr str_to_categories = ConvertsStrings == converts_strings::YES; auto constexpr uses_pd_metadata = UsesPandasMetadata == uses_pandas_metadata::YES; @@ -87,9 +90,8 @@ void BM_parquet_read_options(nvbench::state& state, .use_pandas_metadata(uses_pd_metadata) .timestamp_type(ts_type); - // TODO: add read_parquet_metadata to properly calculate #row_groups - auto constexpr num_row_groups = data_size / row_group_size; - auto constexpr num_chunks = 1; + auto const num_row_groups = read_parquet_metadata(source_sink.make_source_info()).num_rowgroups(); + auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -100,18 +102,15 @@ void BM_parquet_read_options(nvbench::state& state, timer.start(); cudf::size_type rows_read = 0; for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - auto const is_last_chunk = chunk == (num_chunks - 1); switch (RowSelection) { case row_selection::ALL: break; case row_selection::ROW_GROUPS: { - auto row_groups_to_read = segments_in_chunk(num_row_groups, num_chunks, chunk); - if (is_last_chunk) { - // Need to assume that an additional "overflow" row group is present - row_groups_to_read.push_back(num_row_groups); - } - read_options.set_row_groups({row_groups_to_read}); + read_options.set_row_groups({segments_in_chunk(num_row_groups, num_chunks, chunk)}); } break; - case row_selection::NROWS: [[fallthrough]]; + case row_selection::NROWS: + read_options.set_skip_rows(chunk * chunk_row_cnt); + read_options.set_num_rows(chunk_row_cnt); + break; default: CUDF_FAIL("Unsupported row selection method"); } @@ -130,14 +129,26 @@ void BM_parquet_read_options(nvbench::state& state, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } +using row_selections = + nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + row_selections, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("parquet_read_row_selection") + .set_type_axes_names({"column_selection", + "row_selection", + "str_to_categories", + "uses_pandas_metadata", + "timestamp_type"}) + .set_min_samples(4); + using col_selections = nvbench::enum_type_list; - -// TODO: row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file -// and determine num row groups. 
https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 - NVBENCH_BENCH_TYPES(BM_parquet_read_options, NVBENCH_TYPE_AXES(col_selections, nvbench::enum_type_list, diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 0319577f6b9..f3fd5cc5729 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); cudf::strings_column_view input(column->view()); + auto const separator = cudf::string_scalar("_"); for (auto _ : state) { cuda_event_timer raii(state, true); switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input); break; + case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; } } diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 423fe667b05..b556a84c541 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state) auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); }); } else if (tokenize_type == "ngrams") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); }); + auto const delimiter = cudf::string_scalar(""); + auto const separator = cudf::string_scalar("_"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator); + }); } else if (tokenize_type == "characters") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); }); diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 4731c4919e3..6532dae3695 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -16,14 +16,13 @@ #pragma once +#include + #include #include #include #include -#include -#include -#include #include #include #include @@ -32,193 +31,6 @@ namespace cudf { namespace detail { -/** - * @brief The base class for the input or output index normalizing iterator. - * - * This implementation uses CRTP to define the `input_indexalator` and the - * `output_indexalator` classes. This is so this class can manipulate the - * uniquely typed subclass member variable `p_` directly without requiring - * virtual functions since iterator instances will be copied to device memory. - * - * The base class mainly manages updating the `p_` member variable while the - * subclasses handle accessing individual elements in device memory. - * - * @tparam T The derived class type for the iterator. - */ -template -struct base_indexalator { - using difference_type = ptrdiff_t; - using value_type = size_type; - using pointer = size_type*; - using iterator_category = std::random_access_iterator_tag; - - base_indexalator() = default; - base_indexalator(base_indexalator const&) = default; - base_indexalator(base_indexalator&&) = default; - base_indexalator& operator=(base_indexalator const&) = default; - base_indexalator& operator=(base_indexalator&&) = default; - - /** - * @brief Prefix increment operator. 
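
The nvtext calls above now take the ngram count and separator explicitly; a minimal usage sketch of the updated signatures (the strings column is assumed to exist already, e.g. built elsewhere in a test or benchmark):

    #include <cudf/column/column.hpp>
    #include <cudf/scalar/scalar.hpp>
    #include <cudf/strings/strings_column_view.hpp>
    #include <nvtext/generate_ngrams.hpp>
    #include <nvtext/ngrams_tokenize.hpp>

    // Bigrams of whole strings, joined with '_', matching the benchmark's arguments.
    std::unique_ptr<cudf::column> make_bigrams(cudf::strings_column_view const& input)
    {
      auto const separator = cudf::string_scalar("_");
      return nvtext::generate_ngrams(input, 2, separator);
    }

    // Token bigrams: an empty delimiter means whitespace tokenization.
    std::unique_ptr<cudf::column> make_token_bigrams(cudf::strings_column_view const& input)
    {
      auto const delimiter = cudf::string_scalar("");
      auto const separator = cudf::string_scalar("_");
      return nvtext::ngrams_tokenize(input, 2, delimiter, separator);
    }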
- */ - CUDF_HOST_DEVICE inline T& operator++() - { - T& derived = static_cast(*this); - derived.p_ += width_; - return derived; - } - - /** - * @brief Postfix increment operator. - */ - CUDF_HOST_DEVICE inline T operator++(int) - { - T tmp{static_cast(*this)}; - operator++(); - return tmp; - } - - /** - * @brief Prefix decrement operator. - */ - CUDF_HOST_DEVICE inline T& operator--() - { - T& derived = static_cast(*this); - derived.p_ -= width_; - return derived; - } - - /** - * @brief Postfix decrement operator. - */ - CUDF_HOST_DEVICE inline T operator--(int) - { - T tmp{static_cast(*this)}; - operator--(); - return tmp; - } - - /** - * @brief Compound assignment by sum operator. - */ - CUDF_HOST_DEVICE inline T& operator+=(difference_type offset) - { - T& derived = static_cast(*this); - derived.p_ += offset * width_; - return derived; - } - - /** - * @brief Increment by offset operator. - */ - CUDF_HOST_DEVICE inline T operator+(difference_type offset) const - { - auto tmp = T{static_cast(*this)}; - tmp.p_ += (offset * width_); - return tmp; - } - - /** - * @brief Addition assignment operator. - */ - CUDF_HOST_DEVICE inline friend T operator+(difference_type offset, T const& rhs) - { - T tmp{rhs}; - tmp.p_ += (offset * rhs.width_); - return tmp; - } - - /** - * @brief Compound assignment by difference operator. - */ - CUDF_HOST_DEVICE inline T& operator-=(difference_type offset) - { - T& derived = static_cast(*this); - derived.p_ -= offset * width_; - return derived; - } - - /** - * @brief Decrement by offset operator. - */ - CUDF_HOST_DEVICE inline T operator-(difference_type offset) const - { - auto tmp = T{static_cast(*this)}; - tmp.p_ -= (offset * width_); - return tmp; - } - - /** - * @brief Subtraction assignment operator. - */ - CUDF_HOST_DEVICE inline friend T operator-(difference_type offset, T const& rhs) - { - T tmp{rhs}; - tmp.p_ -= (offset * rhs.width_); - return tmp; - } - - /** - * @brief Compute offset from iterator difference operator. - */ - CUDF_HOST_DEVICE inline difference_type operator-(T const& rhs) const - { - return (static_cast(*this).p_ - rhs.p_) / width_; - } - - /** - * @brief Equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator==(T const& rhs) const - { - return rhs.p_ == static_cast(*this).p_; - } - /** - * @brief Not equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator!=(T const& rhs) const - { - return rhs.p_ != static_cast(*this).p_; - } - /** - * @brief Less than operator. - */ - CUDF_HOST_DEVICE inline bool operator<(T const& rhs) const - { - return static_cast(*this).p_ < rhs.p_; - } - /** - * @brief Greater than operator. - */ - CUDF_HOST_DEVICE inline bool operator>(T const& rhs) const - { - return static_cast(*this).p_ > rhs.p_; - } - /** - * @brief Less than or equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator<=(T const& rhs) const - { - return static_cast(*this).p_ <= rhs.p_; - } - /** - * @brief Greater than or equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator>=(T const& rhs) const - { - return static_cast(*this).p_ >= rhs.p_; - } - - protected: - /** - * @brief Constructor assigns width and type member variables for base class. - */ - base_indexalator(int32_t width, data_type dtype) : width_(width), dtype_(dtype) {} - - int width_; /// integer type width = 1,2,4, or 8 - data_type dtype_; /// for type-dispatcher calls -}; - /** * @brief The index normalizing input iterator. 
* @@ -244,65 +56,7 @@ struct base_indexalator { * auto result = thrust::find(thrust::device, begin, end, size_type{12} ); * @endcode */ -struct input_indexalator : base_indexalator { - friend struct indexalator_factory; - friend struct base_indexalator; // for CRTP - - using reference = size_type const; // this keeps STL and thrust happy - - input_indexalator() = default; - input_indexalator(input_indexalator const&) = default; - input_indexalator(input_indexalator&&) = default; - input_indexalator& operator=(input_indexalator const&) = default; - input_indexalator& operator=(input_indexalator&&) = default; - - /** - * @brief Indirection operator returns the value at the current iterator position. - */ - __device__ inline size_type operator*() const { return operator[](0); } - - /** - * @brief Dispatch functor for resolving a size_type value from any index type. - */ - struct index_as_size_type { - template ()>* = nullptr> - __device__ size_type operator()(void const* tp) - { - return static_cast(*static_cast(tp)); - } - template ()>* = nullptr> - __device__ size_type operator()(void const* tp) - { - CUDF_UNREACHABLE("only index types are supported"); - } - }; - /** - * @brief Array subscript operator returns a value at the input - * `idx` position as a `size_type` value. - */ - __device__ inline size_type operator[](size_type idx) const - { - void const* tp = p_ + (idx * width_); - return type_dispatcher(dtype_, index_as_size_type{}, tp); - } - - protected: - /** - * @brief Create an input index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param width The width of the integer type (1, 2, 4, or 8) - * @param data_type Index integer type of width `width` - */ - input_indexalator(void const* data, int width, data_type dtype) - : base_indexalator(width, dtype), p_{static_cast(data)} - { - } - - char const* p_; /// pointer to the integer data in device memory -}; +using input_indexalator = input_normalator; /** * @brief The index normalizing output iterator. @@ -328,79 +82,7 @@ struct input_indexalator : base_indexalator { * thrust::less()); * @endcode */ -struct output_indexalator : base_indexalator { - friend struct indexalator_factory; - friend struct base_indexalator; // for CRTP - - using reference = output_indexalator const&; // required for output iterators - - output_indexalator() = default; - output_indexalator(output_indexalator const&) = default; - output_indexalator(output_indexalator&&) = default; - output_indexalator& operator=(output_indexalator const&) = default; - output_indexalator& operator=(output_indexalator&&) = default; - - /** - * @brief Indirection operator returns this iterator instance in order - * to capture the `operator=(size_type)` calls. - */ - __device__ inline output_indexalator const& operator*() const { return *this; } - - /** - * @brief Array subscript operator returns an iterator instance at the specified `idx` position. - * - * This allows capturing the subsequent `operator=(size_type)` call in this class. - */ - __device__ inline output_indexalator const operator[](size_type idx) const - { - output_indexalator tmp{*this}; - tmp.p_ += (idx * width_); - return tmp; - } - - /** - * @brief Dispatch functor for setting the index value from a size_type value. 
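
Since input_indexalator and output_indexalator are now thin aliases of the normalators, the usage shown in the doxygen blocks is unchanged; a sketch of the input side, assuming the existing indexalator_factory::make_input_iterator helper (an internal detail API) and nvcc compilation, since the header contains device code:

    // example.cu -- illustrative only; indexalator_factory lives in cudf::detail.
    #include <cudf/column/column_view.hpp>
    #include <cudf/detail/indexalator.cuh>
    #include <cudf/types.hpp>

    #include <thrust/execution_policy.h>
    #include <thrust/find.h>

    bool contains_index(cudf::column_view const& gather_map, cudf::size_type value)
    {
      // One thrust::find instantiation covers INT8/16/32/64 and unsigned index
      // columns: each element is normalized to size_type on read.
      auto const begin = cudf::detail::indexalator_factory::make_input_iterator(gather_map);
      auto const end   = begin + gather_map.size();
      return thrust::find(thrust::device, begin, end, value) != end;
    }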
- */ - struct size_type_to_index { - template ()>* = nullptr> - __device__ void operator()(void* tp, size_type const value) - { - (*static_cast(tp)) = static_cast(value); - } - template ()>* = nullptr> - __device__ void operator()(void* tp, size_type const value) - { - CUDF_UNREACHABLE("only index types are supported"); - } - }; - - /** - * @brief Assign a size_type value to the current iterator position. - */ - __device__ inline output_indexalator const& operator=(size_type const value) const - { - void* tp = p_; - type_dispatcher(dtype_, size_type_to_index{}, tp, value); - return *this; - } - - protected: - /** - * @brief Create an output index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param width The width of the integer type (1, 2, 4, or 8) - * @param data_type Index integer type of width `width` - */ - output_indexalator(void* data, int width, data_type dtype) - : base_indexalator(width, dtype), p_{static_cast(data)} - { - } - - char* p_; /// pointer to the integer data in device memory -}; +using output_indexalator = output_normalator; /** * @brief Use this class to create an indexalator instance. @@ -413,7 +95,7 @@ struct indexalator_factory { template ()>* = nullptr> input_indexalator operator()(column_view const& indices) { - return input_indexalator(indices.data(), sizeof(IndexType), indices.type()); + return input_indexalator(indices.data(), indices.type()); } template const&>(index) creates a copy auto const scalar_impl = static_cast const*>(&index); - return input_indexalator(scalar_impl->data(), sizeof(IndexType), index.type()); + return input_indexalator(scalar_impl->data(), index.type()); } template ()>* = nullptr> output_indexalator operator()(mutable_column_view const& indices) { - return output_indexalator(indices.data(), sizeof(IndexType), indices.type()); + return output_indexalator(indices.data(), indices.type()); } template to_arrow_array(cudf::type_id id, Ts&&... args) } } +/** + * @brief Invokes an `operator()` template with the type instantiation based on + * the specified `arrow::DataType`'s `id()`. + * + * This function is analogous to libcudf's type_dispatcher, but instead applies + * to Arrow functions. Its primary use case is to leverage Arrow's + * metaprogramming facilities like arrow::TypeTraits that require translating + * the runtime dtype information into compile-time types. + */ +template +constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, + Functor f, + Ts&&... 
args) +{ + switch (dtype.id()) { + case arrow::Type::INT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::FLOAT: + return f.template operator()(std::forward(args)...); + case arrow::Type::DOUBLE: + return f.template operator()(std::forward(args)...); + case arrow::Type::BOOL: + return f.template operator()(std::forward(args)...); + case arrow::Type::TIMESTAMP: + return f.template operator()(std::forward(args)...); + case arrow::Type::DURATION: + return f.template operator()(std::forward(args)...); + case arrow::Type::STRING: + return f.template operator()(std::forward(args)...); + case arrow::Type::LIST: + return f.template operator()(std::forward(args)...); + case arrow::Type::DECIMAL128: + return f.template operator()(std::forward(args)...); + case arrow::Type::STRUCT: + return f.template operator()(std::forward(args)...); + default: { + CUDF_FAIL("Invalid type."); + } + } +} + // Converting arrow type to cudf type data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); /** - * @copydoc cudf::to_arrow - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::to_arrow(table_view input, std::vector const& metadata, + * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata, @@ -118,13 +172,27 @@ std::shared_ptr to_arrow(table_view input, arrow::MemoryPool* ar_mr); /** - * @copydoc cudf::arrow_to_cudf - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::to_arrow(cudf::scalar const& input, column_metadata const& metadata, + * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) + */ +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr); +/** + * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) */ std::unique_ptr from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh new file mode 100644 index 00000000000..51b3133f84f --- /dev/null +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
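
As a usage sketch of the new arrow_type_dispatcher, a hypothetical functor that maps a runtime arrow::DataType to the size of its compile-time C type via arrow::TypeTraits (the functor name and behavior below are illustrative, not part of the change):

    #include <arrow/api.h>
    #include <arrow/type_traits.h>

    #include <type_traits>

    // Hypothetical functor: byte width of the C type behind a primitive arrow
    // type, -1 for string/list/struct where no fixed-width C type applies.
    struct c_type_width {
      template <typename ArrowType, std::enable_if_t<arrow::has_c_type<ArrowType>::value>* = nullptr>
      int operator()() const
      {
        return sizeof(typename arrow::TypeTraits<ArrowType>::CType);
      }
      template <typename ArrowType, std::enable_if_t<!arrow::has_c_type<ArrowType>::value>* = nullptr>
      int operator()() const
      {
        return -1;
      }
    };

    // e.g. cudf::detail::arrow_type_dispatcher(*arrow::int64(), c_type_width{}) == 8
    //      cudf::detail::arrow_type_dispatcher(*arrow::utf8(),  c_type_width{}) == -1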
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief The base class for the input or output normalizing iterator + * + * The base class mainly manages updating the `p_` member variable while the + * subclasses handle accessing individual elements in device memory. + * + * @tparam Derived The derived class type for the iterator + * @tparam Integer The type the iterator normalizes to + */ +template +struct base_normalator { + static_assert(std::is_integral_v); + using difference_type = std::ptrdiff_t; + using value_type = Integer; + using pointer = Integer*; + using iterator_category = std::random_access_iterator_tag; + + base_normalator() = default; + base_normalator(base_normalator const&) = default; + base_normalator(base_normalator&&) = default; + base_normalator& operator=(base_normalator const&) = default; + base_normalator& operator=(base_normalator&&) = default; + + /** + * @brief Prefix increment operator. + */ + CUDF_HOST_DEVICE inline Derived& operator++() + { + Derived& derived = static_cast(*this); + derived.p_ += width_; + return derived; + } + + /** + * @brief Postfix increment operator. + */ + CUDF_HOST_DEVICE inline Derived operator++(int) + { + Derived tmp{static_cast(*this)}; + operator++(); + return tmp; + } + + /** + * @brief Prefix decrement operator. + */ + CUDF_HOST_DEVICE inline Derived& operator--() + { + Derived& derived = static_cast(*this); + derived.p_ -= width_; + return derived; + } + + /** + * @brief Postfix decrement operator. + */ + CUDF_HOST_DEVICE inline Derived operator--(int) + { + Derived tmp{static_cast(*this)}; + operator--(); + return tmp; + } + + /** + * @brief Compound assignment by sum operator. + */ + CUDF_HOST_DEVICE inline Derived& operator+=(difference_type offset) + { + Derived& derived = static_cast(*this); + derived.p_ += offset * width_; + return derived; + } + + /** + * @brief Increment by offset operator. + */ + CUDF_HOST_DEVICE inline Derived operator+(difference_type offset) const + { + auto tmp = Derived{static_cast(*this)}; + tmp.p_ += (offset * width_); + return tmp; + } + + /** + * @brief Addition assignment operator. + */ + CUDF_HOST_DEVICE inline friend Derived operator+(difference_type offset, Derived const& rhs) + { + Derived tmp{rhs}; + tmp.p_ += (offset * rhs.width_); + return tmp; + } + + /** + * @brief Compound assignment by difference operator. + */ + CUDF_HOST_DEVICE inline Derived& operator-=(difference_type offset) + { + Derived& derived = static_cast(*this); + derived.p_ -= offset * width_; + return derived; + } + + /** + * @brief Decrement by offset operator. + */ + CUDF_HOST_DEVICE inline Derived operator-(difference_type offset) const + { + auto tmp = Derived{static_cast(*this)}; + tmp.p_ -= (offset * width_); + return tmp; + } + + /** + * @brief Subtraction assignment operator. + */ + CUDF_HOST_DEVICE inline friend Derived operator-(difference_type offset, Derived const& rhs) + { + Derived tmp{rhs}; + tmp.p_ -= (offset * rhs.width_); + return tmp; + } + + /** + * @brief Compute offset from iterator difference operator. 
+ */ + CUDF_HOST_DEVICE inline difference_type operator-(Derived const& rhs) const + { + return (static_cast(*this).p_ - rhs.p_) / width_; + } + + /** + * @brief Equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator==(Derived const& rhs) const + { + return rhs.p_ == static_cast(*this).p_; + } + + /** + * @brief Not equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator!=(Derived const& rhs) const + { + return rhs.p_ != static_cast(*this).p_; + } + + /** + * @brief Less than operator. + */ + CUDF_HOST_DEVICE inline bool operator<(Derived const& rhs) const + { + return static_cast(*this).p_ < rhs.p_; + } + + /** + * @brief Greater than operator. + */ + CUDF_HOST_DEVICE inline bool operator>(Derived const& rhs) const + { + return static_cast(*this).p_ > rhs.p_; + } + + /** + * @brief Less than or equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator<=(Derived const& rhs) const + { + return static_cast(*this).p_ <= rhs.p_; + } + + /** + * @brief Greater than or equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator>=(Derived const& rhs) const + { + return static_cast(*this).p_ >= rhs.p_; + } + + protected: + /** + * @brief Constructor assigns width and type member variables for base class. + */ + explicit base_normalator(data_type dtype) : width_(size_of(dtype)), dtype_(dtype) {} + + int width_; /// integer type width = 1,2,4, or 8 + data_type dtype_; /// for type-dispatcher calls +}; + +/** + * @brief The integer normalizing input iterator + * + * This is an iterator that can be used for index types (integers) without + * requiring a type-specific instance. It can be used for any iterator + * interface for reading an array of integer values of type + * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. + * Reading specific elements always return a type of `Integer` + * + * @tparam Integer Type returned by all read functions + */ +template +struct input_normalator : base_normalator, Integer> { + friend struct base_normalator, Integer>; // for CRTP + + using reference = Integer const; // this keeps STL and thrust happy + + input_normalator() = default; + input_normalator(input_normalator const&) = default; + input_normalator(input_normalator&&) = default; + input_normalator& operator=(input_normalator const&) = default; + input_normalator& operator=(input_normalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline Integer operator*() const { return operator[](0); } + + /** + * @brief Dispatch functor for resolving a Integer value from any integer type + */ + struct normalize_type { + template >* = nullptr> + __device__ Integer operator()(void const* tp) + { + return static_cast(*static_cast(tp)); + } + template >* = nullptr> + __device__ Integer operator()(void const*) + { + CUDF_UNREACHABLE("only integral types are supported"); + } + }; + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a `Integer` value. + */ + __device__ inline Integer operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return type_dispatcher(this->dtype_, normalize_type{}, tp); + } + + /** + * @brief Create an input index normalizing iterator. + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. 
+ * @param data_type Type of data in data + */ + input_normalator(void const* data, data_type dtype) + : base_normalator, Integer>(dtype), p_{static_cast(data)} + { + } + + char const* p_; /// pointer to the integer data in device memory +}; + +/** + * @brief The integer normalizing output iterator + * + * This is an iterator that can be used for index types (integers) without + * requiring a type-specific instance. It can be used for any iterator + * interface for writing an array of integer values of type + * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. + * Setting specific elements always accept the `Integer` type values. + * + * @tparam Integer The type used for all write functions + */ +template +struct output_normalator : base_normalator, Integer> { + friend struct base_normalator, Integer>; // for CRTP + + using reference = output_normalator const&; // required for output iterators + + output_normalator() = default; + output_normalator(output_normalator const&) = default; + output_normalator(output_normalator&&) = default; + output_normalator& operator=(output_normalator const&) = default; + output_normalator& operator=(output_normalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(Integer)` calls. + */ + __device__ inline output_normalator const& operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(Integer)` call in this class. + */ + __device__ inline output_normalator const operator[](size_type idx) const + { + output_normalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Dispatch functor for setting the index value from a size_type value. + */ + struct normalize_type { + template >* = nullptr> + __device__ void operator()(void* tp, Integer const value) + { + (*static_cast(tp)) = static_cast(value); + } + template >* = nullptr> + __device__ void operator()(void*, Integer const) + { + CUDF_UNREACHABLE("only index types are supported"); + } + }; + + /** + * @brief Assign an Integer value to the current iterator position + */ + __device__ inline output_normalator const& operator=(Integer const value) const + { + void* tp = p_; + type_dispatcher(this->dtype_, normalize_type{}, tp, value); + return *this; + } + + /** + * @brief Create an output normalizing iterator + * + * @param data Pointer to an integer array in device memory. + * @param data_type Type of data in data + */ + output_normalator(void* data, data_type dtype) + : base_normalator, Integer>(dtype), p_{static_cast(data)} + { + } + + char* p_; /// pointer to the integer data in device memory +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index fb13eabe11a..959b785bf87 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -53,12 +53,14 @@ namespace dictionary { * * @param column The column to dictionary encode * @param indices_type The integer type to use for the indices + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Returns a dictionary column */ std::unique_ptr encode( column_view const& column, data_type indices_type = data_type{type_id::UINT32}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -72,11 +74,13 @@ std::unique_ptr encode( * @endcode * * @param dictionary_column Existing dictionary column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with type matching the dictionary_column's keys */ std::unique_ptr decode( dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/dictionary/search.hpp b/cpp/include/cudf/dictionary/search.hpp index ed7a9c84693..1b72cf42acd 100644 --- a/cpp/include/cudf/dictionary/search.hpp +++ b/cpp/include/cudf/dictionary/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,12 +37,14 @@ namespace dictionary { * * @param dictionary The dictionary to search for the key. * @param key The value to search for in the dictionary keyset. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned scalar's device memory. - * @return Numeric scalar index value of the key within the dictionary + * @return Numeric scalar index value of the key within the dictionary. */ std::unique_ptr get_index( dictionary_column_view const& dictionary, scalar const& key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp index 2fcfb5e1f7c..81728e1ff73 100644 --- a/cpp/include/cudf/dictionary/update_keys.hpp +++ b/cpp/include/cudf/dictionary/update_keys.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,13 +51,15 @@ namespace dictionary { * @throw cudf_logic_error if the new_keys contain nulls. * * @param dictionary_column Existing dictionary column. - * @param new_keys New keys to incorporate into the dictionary_column + * @param new_keys New keys to incorporate into the dictionary_column. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. 
*/ std::unique_ptr add_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,13 +83,15 @@ std::unique_ptr add_keys( * @throw cudf_logic_error if the keys_to_remove contain nulls. * * @param dictionary_column Existing dictionary column. - * @param keys_to_remove The keys to remove from the dictionary_column + * @param keys_to_remove The keys to remove from the dictionary_column. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr remove_keys( dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -103,11 +107,13 @@ std::unique_ptr remove_keys( * @endcode * * @param dictionary_column Existing dictionary column. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr remove_unused_keys( dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -134,12 +140,14 @@ std::unique_ptr remove_unused_keys( * * @param dictionary_column Existing dictionary column. * @param keys New keys to use for the output column. Must not contain nulls. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr set_keys( dictionary_column_view const& dictionary_column, column_view const& keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -149,11 +157,13 @@ std::unique_ptr set_keys( * The result is a vector of new dictionaries with a common set of keys. * * @param input Dictionary columns to match keys. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary columns. 
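
With the stream parameter now threaded through the dictionary APIs (defaulting to cudf::get_default_stream()), callers that manage their own streams pass it just before the memory resource; a minimal sketch:

    #include <cudf/column/column.hpp>
    #include <cudf/column/column_view.hpp>
    #include <cudf/dictionary/dictionary_column_view.hpp>
    #include <cudf/dictionary/encode.hpp>
    #include <cudf/dictionary/update_keys.hpp>
    #include <cudf/types.hpp>

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    // Encode a column and drop unused keys, performing all work on the caller's stream.
    std::unique_ptr<cudf::column> encode_on_stream(cudf::column_view const& col,
                                                   rmm::cuda_stream_view stream)
    {
      auto* mr  = rmm::mr::get_current_device_resource();
      auto dict = cudf::dictionary::encode(col, cudf::data_type{cudf::type_id::UINT32}, stream, mr);
      return cudf::dictionary::remove_unused_keys(
        cudf::dictionary_column_view(dict->view()), stream, mr);
    }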
*/ std::vector> match_dictionaries( cudf::host_span input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/include/cudf/hashing/detail/hash_allocator.cuh similarity index 100% rename from cpp/src/hash/hash_allocator.cuh rename to cpp/include/cudf/hashing/detail/hash_allocator.cuh diff --git a/cpp/src/hash/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh similarity index 100% rename from cpp/src/hash/helper_functions.cuh rename to cpp/include/cudf/hashing/detail/helper_functions.cuh diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index e210179b147..865cc004107 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -126,23 +126,56 @@ struct column_metadata { * * @param input table_view that needs to be converted to arrow Table * @param metadata Contains hierarchy of names of columns and children + * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Table * @return arrow Table generated from `input` */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +/** + * @brief Create `arrow::Scalar` from cudf scalar `input` + * + * Converts the `cudf::scalar` to `arrow::Scalar`. + * + * @param input scalar that needs to be converted to arrow Scalar + * @param metadata Contains hierarchy of names of columns and children + * @param stream CUDA stream used for device memory operations and kernel launches + * @param ar_mr arrow memory pool to allocate memory for arrow Scalar + * @return arrow Scalar generated from `input` + */ +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `cudf::table` from given arrow Table input * * @param input arrow:Table that needs to be converted to `cudf::table` + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table */ std::unique_ptr
from_arrow( arrow::Table const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `cudf::scalar` from given arrow Scalar input + * + * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate `cudf::scalar` + * @return cudf scalar generated from given arrow Scalar + */ + +std::unique_ptr from_arrow( + arrow::Scalar const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh deleted file mode 100644 index b7ee5e05e96..00000000000 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include - -namespace cudf::io::json::detail { - -// Unicode code point escape sequence -static constexpr char UNICODE_SEQ = 0x7F; - -// Invalid escape sequence -static constexpr char NON_ESCAPE_CHAR = 0x7E; - -// Unicode code point escape sequence prefix comprises '\' and 'u' characters -static constexpr size_type UNICODE_ESC_PREFIX = 2; - -// Unicode code point escape sequence comprises four hex characters -static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4; - -// A unicode code point escape sequence is \uXXXX -static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT; - -static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800; -static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; -static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; -static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; - -/** - * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or - * whether type casting failed. - */ -enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE }; - -/** - * @brief Providing additional information about the type casting result. - */ -struct data_casting_result_info { - // Number of bytes written to output - size_type bytes; - // Whether parsing succeeded, item was parsed to null, or failed - data_casting_result result; -}; - -/** - * @brief Returns the character to output for a given escaped character that's following a - * backslash. - * - * @param escaped_char The character following the backslash. 
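
The new scalar overloads mirror the existing table interop entry points; a small sketch of the from_arrow direction, leaving the stream and memory resource at their defaults:

    #include <arrow/api.h>
    #include <cudf/interop.hpp>
    #include <cudf/scalar/scalar.hpp>

    // Convert a host value to an arrow::Scalar and pull it into a cudf::scalar.
    std::unique_ptr<cudf::scalar> int64_scalar_from_arrow()
    {
      auto const arrow_scalar = arrow::MakeScalar(int64_t{42});  // std::shared_ptr<arrow::Scalar>
      return cudf::from_arrow(*arrow_scalar);
    }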
- * @return The character to output for a given character that's following a backslash - */ -__device__ __forceinline__ char get_escape_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return '"'; - case '\\': return '\\'; - case '/': return '/'; - case 'b': return '\b'; - case 'f': return '\f'; - case 'n': return '\n'; - case 'r': return '\r'; - case 't': return '\t'; - case 'u': return UNICODE_SEQ; - default: return NON_ESCAPE_CHAR; - } -} - -/** - * @brief Returns the escaped characters for a given character. - * - * @param escaped_char The character to escape. - * @return The escaped characters for a given character. - */ -__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return {'\\', '"'}; - case '\\': return {'\\', '\\'}; - case '/': return {'\\', '/'}; - case '\b': return {'\\', 'b'}; - case '\f': return {'\\', 'f'}; - case '\n': return {'\\', 'n'}; - case '\r': return {'\\', 'r'}; - case '\t': return {'\\', 't'}; - // case 'u': return UNICODE_SEQ; - default: return {'\0', escaped_char}; - } -} -/** - * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence - * \uXXXX. - * - * @param str Pointer to the first (most-significant) hex digit - * @return The parsed hex value if successful, -1 otherwise. - */ -__device__ __forceinline__ int32_t parse_unicode_hex(char const* str) -{ - // Prepare result - int32_t result = 0, base = 1; - constexpr int32_t hex_radix = 16; - - // Iterate over hex digits right-to-left - size_type index = UNICODE_HEX_DIGIT_COUNT; - while (index-- > 0) { - char const ch = str[index]; - if (ch >= '0' && ch <= '9') { - result += static_cast((ch - '0') + 0) * base; - base *= hex_radix; - } else if (ch >= 'A' && ch <= 'F') { - result += static_cast((ch - 'A') + 10) * base; - base *= hex_radix; - } else if (ch >= 'a' && ch <= 'f') { - result += static_cast((ch - 'a') + 10) * base; - base *= hex_radix; - } else { - return -1; - } - } - return result; -} - -/** - * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to - * \p out_it - */ -constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) -{ - auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) - : strings::detail::from_char_utf8(character, out_it); - if (out_it) out_it += bytes; - return bytes; -} - -/** - * @brief Processes a string, replaces escape sequences and optionally strips off the quote - * characters. - * - * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to - * char - * @param in_begin Iterator to the first item to process - * @param in_end Iterator to one past the last item to process - * @param d_buffer Output character buffer to the first item to write - * @param options Settings for controlling string processing behavior - * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is - * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether - * parsing succeeded, item was parsed to null, or failed. 
- */ -template -__device__ __forceinline__ data_casting_result_info -process_string(in_iterator_t in_begin, - in_iterator_t in_end, - char* d_buffer, - cudf::io::parse_options_view const& options) -{ - int32_t bytes = 0; - auto const num_in_chars = thrust::distance(in_begin, in_end); - // String values are indicated by keeping the quote character - bool const is_string_value = - num_in_chars >= 2LL && - (options.quotechar == '\0' || - (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); - - // Copy literal/numeric value - if (not is_string_value) { - while (in_begin != in_end) { - if (d_buffer) *d_buffer++ = *in_begin; - ++in_begin; - ++bytes; - } - return {bytes, data_casting_result::PARSING_SUCCESS}; - } - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. {"a":1.23}) - char const backslash_char = '\\'; - - // Escape-flag, set after encountering a backslash character - bool escape = false; - - // Exclude beginning and ending quote chars from string range - if (!options.keepquotes) { - ++in_begin; - --in_end; - } - - // Iterate over the input - while (in_begin != in_end) { - // Copy single character to output - if (!escape) { - escape = (*in_begin == backslash_char); - if (!escape) { - if (d_buffer) *d_buffer++ = *in_begin; - ++bytes; - } - ++in_begin; - continue; - } - - // Previous char indicated beginning of escape sequence - // Reset escape flag for next loop iteration - escape = false; - - // Check the character that is supposed to be escaped - auto escaped_char = get_escape_char(*in_begin); - - // We escaped an invalid escape character -> "fail"/null for this item - if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; } - - // Regular, single-character escape - if (escaped_char != UNICODE_SEQ) { - if (d_buffer) *d_buffer++ = escaped_char; - ++bytes; - ++in_begin; - continue; - } - - // This is an escape sequence of a unicode code point: \uXXXX, - // where each X in XXXX represents a hex digit - // Skip over the 'u' char from \uXXXX to the first hex digit - ++in_begin; - - // Make sure that there's at least 4 characters left from the - // input, which are expected to be hex digits - if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) { - return {bytes, data_casting_result::PARSING_FAILURE}; - } - - auto hex_val = parse_unicode_hex(in_begin); - - // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item - if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; } - - // Skip over the four hex digits - thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); - - // If this may be a UTF-16 encoded surrogate pair: - // we expect another \uXXXX sequence - int32_t hex_low_val = 0; - if (thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && - *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { - // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low - // surrogate - hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2)); - } - - // This is indeed a UTF16 surrogate pair - if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && - hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { - // Skip over the second \uXXXX sequence - thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS); - - // Compute UTF16-encoded code point - uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) 
<< 10) + - (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); - auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); - bytes += write_utf8_char(utf8_chars, d_buffer); - } - - // Just a single \uXXXX sequence - else { - auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); - bytes += write_utf8_char(utf8_chars, d_buffer); - } - } - - // The last character of the input is a backslash -> "fail"/null for this item - if (escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } - return {bytes, data_casting_result::PARSING_SUCCESS}; -} - -template -struct string_parse { - str_tuple_it str_tuples; - bitmask_type* null_mask; - size_type* null_count_data; - cudf::io::parse_options_view const options; - size_type* d_offsets{}; - char* d_chars{}; - - __device__ void operator()(size_type idx) - { - if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const in_begin = str_tuples[idx].first; - auto const in_end = in_begin + str_tuples[idx].second; - auto const num_in_chars = str_tuples[idx].second; - - // Check if the value corresponds to the null literal - auto const is_null_literal = - (!d_chars) && - serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); - if (is_null_literal && null_mask != nullptr) { - clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[idx] = 0; - return; - } - - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - auto str_process_info = process_string(in_begin, in_end, d_buffer, options); - if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { - if (null_mask != nullptr) { - clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); - } - if (!d_chars) d_offsets[idx] = 0; - } else { - if (!d_chars) d_offsets[idx] = str_process_info.bytes; - } - } -}; -/** - * @brief Parses the data from an iterator of string views, casting it to the given target data type - * - * @param str_tuples Iterator returning a string view, i.e., a (ptr, length) pair - * @param col_size The total number of items of this column - * @param col_type The column's target data type - * @param null_mask A null mask that renders certain items from the input invalid - * @param options Settings for controlling the processing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr The resource to be used for device memory allocation - * @return The column that contains the parsed data - */ -template -std::unique_ptr parse_data(str_tuple_it str_tuples, - size_type col_size, - data_type col_type, - B&& null_mask, - size_type null_count, - cudf::io::parse_options_view const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - - auto d_null_count = rmm::device_scalar(null_count, stream); - auto null_count_data = d_null_count.data(); - - if (col_type == cudf::data_type{cudf::type_id::STRING}) { - // this utility calls the functor to build the offsets and chars columns; - // the bitmask and null count may be updated by parse failures - auto [offsets, chars] = cudf::strings::detail::make_strings_children( - string_parse{ - str_tuples, static_cast(null_mask.data()), null_count_data, options}, - col_size, - stream, - mr); - - return make_strings_column(col_size, - std::move(offsets), - std::move(chars), - d_null_count.value(stream), - std::move(null_mask)); - } - - auto out_col = - make_fixed_width_column(col_type, col_size, 
std::move(null_mask), null_count, stream, mr); - auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); - - // use existing code (`ConvertFunctor`) to convert values - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( - size_type row) { - if (col.is_null(row)) { return; } - auto const in = str_tuples[row]; - - auto const is_null_literal = - serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); - - if (is_null_literal) { - col.set_null(row); - atomicAdd(null_count_data, 1); - return; - } - - // If this is a string value, remove quotes - auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); - - auto const is_parsed = cudf::type_dispatcher(col_type, - ConvertFunctor{}, - in_begin, - in_end, - col.data(), - row, - col_type, - options, - false); - if (not is_parsed) { - col.set_null(row); - atomicAdd(null_count_data, 1); - } - }); - - out_col->set_null_count(d_null_count.value(stream)); - - return out_col; -} - -} // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index efdb85691bd..ec93c709163 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -199,10 +199,30 @@ struct window_bounds { * column of the same type as the input. Therefore it is suggested to convert integer column types * (especially low-precision integers) to `FLOAT32` or `FLOAT64` before doing a rolling `MEAN`. * + * Note: `preceding_window` and `following_window` could well have negative values. This yields + * windows where the current row might not be included at all. For instance, consider a window + * defined as (preceding=3, following=-1). This produces a window from 2 (i.e. 3-1) rows preceding + * the current row, and 1 row *preceding* the current row. For the example above, the window for + * row#3 is: + * + * [ 10, 20, 10, 50, 60, 20, 30, 80, 40 ] + * <--window--> ^ + * | + * current_row + * + * Similarly, `preceding` could have a negative value, indicating that the window begins at a + * position after the current row. It differs slightly from the semantics for `following`, because + * `preceding` includes the current row. Therefore: + * 1. preceding=1 => Window starts at the current row. + * 2. preceding=0 => Window starts at 1 past the current row. + * 3. preceding=-1 => Window starts at 2 past the current row. Etc. + * * @param[in] group_keys The (pre-sorted) grouping columns * @param[in] input The input column (to be aggregated) - * @param[in] preceding_window The static rolling window size in the backward direction - * @param[in] following_window The static rolling window size in the forward direction + * @param[in] preceding_window The static rolling window size in the backward direction (for + * positive values), or forward direction (for negative values) + * @param[in] following_window The static rolling window size in the forward direction (for positive + * values), or backward direction (for negative values) * @param[in] min_periods Minimum number of observations in window required to have a value, * otherwise element `i` is null. * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.) 
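The negative-bounds semantics documented above reduce to simple index arithmetic: for row i the window covers the inclusive row range [i - preceding + 1, i + following]. Below is a minimal host-side sketch, not libcudf code; the clamping to valid rows is an assumption of the sketch, while the real API handles short windows through min_periods.

#include <algorithm>
#include <cstdio>

int main()
{
  int const num_rows  = 9;    // e.g. [10, 20, 10, 50, 60, 20, 30, 80, 40]
  int const preceding = 3;    // window begins 2 rows before the current row
  int const following = -1;   // window ends 1 row *before* the current row

  for (int i = 0; i < num_rows; ++i) {
    int const start = std::max(0, i - preceding + 1);
    int const end   = std::min(num_rows - 1, i + following);
    if (start > end) {
      std::printf("row %d: empty window\n", i);
    } else {
      std::printf("row %d: rows [%d, %d]\n", i, start, end);  // row 3 -> rows [1, 2]
    }
  }
  return 0;
}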
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 6924e77ae9b..e4e803b2d3c 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -43,6 +44,7 @@ namespace cudf { * @param null_precedence The desired order of null compared to other elements * for each column. Size must be equal to `input.num_columns()` or empty. * If empty, all columns will be sorted in `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A non-nullable column of elements containing the permuted row indices of * `input` if it were sorted @@ -51,6 +53,7 @@ std::unique_ptr sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -65,27 +68,30 @@ std::unique_ptr stable_sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Checks whether the rows of a `table` are sorted in a lexicographical * order. * - * @param[in] table Table whose rows need to be compared for ordering - * @param[in] column_order The expected sort order for each column. Size - * must be equal to `in.num_columns()` or empty. If - * empty, it is expected all columns are in - * ascending order. - * @param[in] null_precedence The desired order of null compared to other - * elements for each column. Size must be equal to - * `input.num_columns()` or empty. If empty, - * `null_order::BEFORE` is assumed for all columns. - * - * @returns bool true if sorted as expected, false if not + * @param table Table whose rows need to be compared for ordering + * @param column_order The expected sort order for each column. Size + * must be equal to `in.num_columns()` or empty. If + * empty, it is expected all columns are in + * ascending order. + * @param null_precedence The desired order of null compared to other + * elements for each column. Size must be equal to + * `input.num_columns()` or empty. If empty, + * `null_order::BEFORE` is assumed for all columns. + * + * @param stream CUDA stream used for device memory operations and kernel launches + * @returns true if sorted as expected, false if not */ bool is_sorted(cudf::table_view const& table, std::vector const& column_order, - std::vector const& null_precedence); + std::vector const& null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Performs a lexicographic sort of the rows of a table @@ -98,6 +104,7 @@ bool is_sorted(cudf::table_view const& table, * elements for each column in `input`. Size must be equal to * `input.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return New table containing the desired sorted order of `input` */ @@ -105,6 +112,7 @@ std::unique_ptr
sort( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,6 +132,7 @@ std::unique_ptr
sort( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The reordering of `values` determined by the lexicographic order of * the rows of `keys`. @@ -133,6 +142,7 @@ std::unique_ptr
sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -154,6 +164,7 @@ std::unique_ptr
sort_by_key( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The reordering of `values` determined by the lexicographic order of * the rows of `keys`. @@ -163,6 +174,7 @@ std::unique_ptr
stable_sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,6 +201,7 @@ std::unique_ptr
stable_sort_by_key( * @param null_precedence The desired order of null compared to other elements * for column * @param percentage flag to convert ranks to percentage in range (0,1] + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A column of containing the rank of the each element of the column of `input`. The output * column type will be `size_type`column by default or else `double` when @@ -201,6 +214,7 @@ std::unique_ptr rank( null_policy null_handling, null_order null_precedence, bool percentage, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -241,6 +255,7 @@ std::unique_ptr rank( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to allocate any returned objects * @return sorted order of the segment sorted table * @@ -250,6 +265,7 @@ std::unique_ptr segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -262,6 +278,7 @@ std::unique_ptr stable_segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -306,6 +323,7 @@ std::unique_ptr stable_segmented_sorted_order( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to allocate any returned objects * @return table with elements in each segment sorted * @@ -316,6 +334,7 @@ std::unique_ptr
segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -329,6 +348,7 @@ std::unique_ptr
stable_segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index 2fed36862b9..c1aa8b294b3 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -43,19 +43,21 @@ namespace strings { * * @throw cudf::logic_error if start position is greater than stop position. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param start First character position to include in the search. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param start First character position to include in the search * @param stop Last position (exclusive) to include in the search. * Default of -1 will search to the end of the string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values */ std::unique_ptr find( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -72,19 +74,21 @@ std::unique_ptr find( * * @throw cudf::logic_error if start position is greater than stop position. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param start First position to include in the search. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param start First position to include in the search * @param stop Last position (exclusive) to include in the search. * Default of -1 will search starting at the end of the string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values */ std::unique_ptr rfind( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -123,37 +127,41 @@ std::unique_ptr find( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. 
+ * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr contains( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates * the corresponding target string was found within that string in the provided column. * - * The 'output[i] = true` if string `targets[i]` is found inside `strings[i]` otherwise + * The 'output[i] = true` if string `targets[i]` is found inside `input[i]` otherwise * `output[i] = false`. * If `target[i]` is an empty string, true is returned for `output[i]`. * If `target[i]` is null, false is returned for `output[i]`. * - * Any null `strings[i]` row results in a null `output[i]` row. + * Any null string entries return corresponding null entries in the output columns. * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings column of targets to check row-wise in `strings`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param targets Strings column of targets to check row-wise in `strings` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr contains( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -166,14 +174,16 @@ std::unique_ptr contains( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * @return New type_id::BOOL8 column. */ std::unique_ptr starts_with( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -190,14 +200,16 @@ std::unique_ptr starts_with( * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. 
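The strings search functions above, like the sorting APIs earlier in this diff, gain an rmm::cuda_stream_view parameter immediately before the memory resource, defaulting to cudf::get_default_stream(), so existing call sites keep compiling. A minimal usage sketch follows; scv, search_on_stream, and the "foo" target are hypothetical, and a user-owned rmm::cuda_stream is only one way to obtain a stream.

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/find.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream.hpp>

void search_on_stream(cudf::strings_column_view const& scv)
{
  rmm::cuda_stream stream;  // user-owned stream

  // Passing the stream explicitly runs the search kernels on `stream`
  // instead of the library default.
  auto hits  = cudf::strings::contains(scv, cudf::string_scalar("foo"), stream.view());
  auto where = cudf::strings::find(scv, cudf::string_scalar("foo"), 0, -1, stream.view());
}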
+ * @param input Strings instance for this operation + * @param targets Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr starts_with( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -210,14 +222,16 @@ std::unique_ptr starts_with( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr ends_with( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -234,14 +248,16 @@ std::unique_ptr ends_with( * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param targets Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr ends_with( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 21cfdb15146..06b851c5012 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,14 +48,16 @@ namespace strings { * * @throw cudf::logic_error if `targets` is empty or contains nulls * - * @param input Strings instance for this operation. - * @param targets Strings to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Lists column with character position values. 
+ * @param input Strings instance for this operation + * @param targets Strings to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Lists column with character position values */ std::unique_ptr find_multiple( strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 745f0fc19ff..379b9624dc6 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -57,12 +57,14 @@ struct regex_program; * * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New lists column of strings */ std::unique_ptr findall( strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index cc8cac35ef4..c0932b81dc3 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -944,8 +944,10 @@ class dictionary_column_wrapper : public detail::column_wrapper { template dictionary_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{} { - wrapped = cudf::dictionary::encode( - fixed_width_column_wrapper(begin, end)); + wrapped = + cudf::dictionary::encode(fixed_width_column_wrapper(begin, end), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** @@ -978,7 +980,9 @@ class dictionary_column_wrapper : public detail::column_wrapper { : column_wrapper{} { wrapped = cudf::dictionary::encode( - fixed_width_column_wrapper(begin, end, v)); + fixed_width_column_wrapper(begin, end, v), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** @@ -1134,7 +1138,9 @@ class dictionary_column_wrapper : public detail::column_wrapper { template dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { - wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end)); + wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** @@ -1169,7 +1175,9 @@ class dictionary_column_wrapper : public detail::column_wrapper { dictionary_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v) : column_wrapper{} { - wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v)); + wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 5d66401df9d..46f2c0e7bc9 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -47,19 +47,19 @@ namespace nvtext { * @throw cudf::logic_error if `separator` is 
invalid * @throw cudf::logic_error if there are not enough strings to generate any ngrams * - * @param strings Strings column to tokenize and produce ngrams from. - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to tokenize and produce ngrams from + * @param ngrams The ngram number to generate + * @param separator The string to use for separating ngram tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr generate_ngrams( - cudf::strings_column_view const& strings, - cudf::size_type ngrams = 2, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::strings_column_view const& input, + cudf::size_type ngrams, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Generates ngrams of characters within each string. @@ -79,15 +79,17 @@ std::unique_ptr generate_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. * Default is 2 = bigram. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr generate_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 2, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -113,14 +115,16 @@ std::unique_ptr generate_character_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return A lists column of hash values */ std::unique_ptr hash_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp index 17f20f7ea4c..9d76ef8689f 100644 --- a/cpp/include/nvtext/ngrams_tokenize.hpp +++ b/cpp/include/nvtext/ngrams_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,22 +66,22 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column to tokenize and produce ngrams from. - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. + * @param input Strings column to tokenize and produce ngrams from + * @param ngrams The ngram number to generate * @param delimiter UTF-8 characters used to separate each string into tokens. - * The default of empty string will separate tokens using whitespace. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * An empty string will separate tokens using whitespace. + * @param separator The string to use for separating ngram tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr ngrams_tokenize( - cudf::strings_column_view const& strings, - cudf::size_type ngrams = 2, - cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::strings_column_view const& input, + cudf::size_type ngrams, + cudf::string_scalar const& delimiter, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index ab22c07e4d5..3973100aced 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -130,10 +130,11 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column std::unique_ptr add_keys(dictionary_column_view const& dictionary_column, column_view const& keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::add_keys(dictionary_column, keys, cudf::get_default_stream(), mr); + return detail::add_keys(dictionary_column, keys, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index 01411d06b62..fdf546b5875 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
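With the defaults removed from nvtext::generate_ngrams and nvtext::ngrams_tokenize above, callers must now state the ngram count, delimiter, and separator explicitly (and may pass a stream). A hedged sketch of what a migrated call site could look like; scv and make_ngrams are hypothetical names.

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/generate_ngrams.hpp>
#include <nvtext/ngrams_tokenize.hpp>

void make_ngrams(cudf::strings_column_view const& scv)
{
  // Previously generate_ngrams(scv) implied ngrams=2 and separator "_";
  // both arguments are now spelled out.
  auto bigrams = nvtext::generate_ngrams(scv, 2, cudf::string_scalar("_"));

  // Likewise for ngrams_tokenize: ngram count, token delimiter ("" means
  // whitespace), and ngram separator are all explicit.
  auto tokens =
    nvtext::ngrams_tokenize(scv, 2, cudf::string_scalar(""), cudf::string_scalar("_"));
}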
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,10 +65,11 @@ std::unique_ptr decode(dictionary_column_view const& source, } // namespace detail std::unique_ptr decode(dictionary_column_view const& source, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::decode(source, cudf::get_default_stream(), mr); + return detail::decode(source, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index fe8e777b694..c92b57f0cac 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,10 +89,11 @@ data_type get_indices_type_for_size(size_type keys_size) std::unique_ptr encode(column_view const& input_column, data_type indices_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::encode(input_column, indices_type, cudf::get_default_stream(), mr); + return detail::encode(input_column, indices_type, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 9fe4a63373b..86b70f1119b 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -195,17 +195,19 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction std::unique_ptr remove_keys(dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_keys(dictionary_column, keys_to_remove, cudf::get_default_stream(), mr); + return detail::remove_keys(dictionary_column, keys_to_remove, stream, mr); } std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_unused_keys(dictionary_column, cudf::get_default_stream(), mr); + return detail::remove_unused_keys(dictionary_column, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 8e97a387780..e35aded1984 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
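The dictionary entry points above now forward the caller's stream to the detail layer instead of hard-coding cudf::get_default_stream(). Here is a small sketch of encoding on an explicit stream, mirroring the cudf_test::column_wrapper change earlier in this diff; input and encode_on_stream are hypothetical names.

#include <cudf/column/column_view.hpp>
#include <cudf/dictionary/encode.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream.hpp>

void encode_on_stream(cudf::column_view const& input)
{
  rmm::cuda_stream stream;

  // UINT32 indices, built on the caller-provided stream rather than the
  // default stream.
  auto dict = cudf::dictionary::encode(
    input, cudf::data_type{cudf::type_id::UINT32}, stream.view());
}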
@@ -79,10 +79,8 @@ struct find_index_fn { using ScalarType = cudf::scalar_type_t; auto find_key = static_cast(key).value(stream); auto keys_view = column_device_view::create(input.keys(), stream); - auto iter = thrust::equal_range(rmm::exec_policy(cudf::get_default_stream()), - keys_view->begin(), - keys_view->end(), - find_key); + auto iter = thrust::equal_range( + rmm::exec_policy(stream), keys_view->begin(), keys_view->end(), find_key); return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, thrust::distance(keys_view->begin(), iter.first), @@ -176,10 +174,11 @@ std::unique_ptr get_insert_index(dictionary_column_view const& dictionar std::unique_ptr get_index(dictionary_column_view const& dictionary, scalar const& key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_index(dictionary, key, cudf::get_default_stream(), mr); + return detail::get_index(dictionary, key, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 36f5021d305..b49cf7850b1 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -241,17 +241,20 @@ std::pair>, std::vector> match_d std::unique_ptr set_keys(dictionary_column_view const& dictionary_column, column_view const& keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::set_keys(dictionary_column, keys, cudf::get_default_stream(), mr); + return detail::set_keys(dictionary_column, keys, stream, mr); } std::vector> match_dictionaries( - cudf::host_span input, rmm::mr::device_memory_resource* mr) + cudf::host_span input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::match_dictionaries(input, cudf::get_default_stream(), mr); + return detail::match_dictionaries(input, stream, mr); } } // namespace dictionary diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index 439b1c2d066..d773c2763df 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -16,12 +16,12 @@ #pragma once -#include -#include #include #include #include +#include +#include #include #include diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index 87075a39ea3..183042fc0f4 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -16,11 +16,10 @@ #pragma once -#include - #include #include #include +#include #include #include diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 30cfee97fd8..e39625c92e7 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -419,6 +419,52 @@ std::unique_ptr get_column(arrow::Array const& array, : get_empty_type_column(array.length()); } +struct BuilderGenerator { + template && + !std::is_same_v)> + std::shared_ptr operator()(std::shared_ptr const& type) + { + return std::make_shared::BuilderType>( + type, arrow::default_memory_pool()); + } + + template || + std::is_same_v)> + std::shared_ptr operator()(std::shared_ptr const& type) + { + CUDF_FAIL("Type not supported by BuilderGenerator"); + } +}; + +std::shared_ptr make_builder(std::shared_ptr const& type) +{ + switch (type->id()) { + case arrow::Type::STRUCT: { + std::vector> field_builders; + + for (auto field : type->fields()) { + auto const vt = field->type(); + if (vt->id() == arrow::Type::STRUCT || 
vt->id() == arrow::Type::LIST) { + field_builders.push_back(make_builder(vt)); + } else { + field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt)); + } + } + return std::make_shared( + type, arrow::default_memory_pool(), field_builders); + } + case arrow::Type::LIST: { + return std::make_shared(arrow::default_memory_pool(), + make_builder(type->field(0)->type())); + } + default: { + return arrow_type_dispatcher(*type, BuilderGenerator{}, type); + } + } +} + } // namespace std::unique_ptr
from_arrow(arrow::Table const& input_table, @@ -462,14 +508,54 @@ std::unique_ptr<table>
from_arrow(arrow::Table const& input_table, return std::make_unique<table>
(std::move(columns)); } +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Get a builder for the scalar type + auto builder = detail::make_builder(input.type); + + auto status = builder->AppendScalar(input); + if (status != arrow::Status::OK()) { + if (status.IsNotImplemented()) { + // The only known failure case here is for nulls + CUDF_FAIL("Cannot create untyped null scalars or nested types with untyped null leaf nodes", + std::invalid_argument); + } + CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); + } + + auto maybe_array = builder->Finish(); + if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } + auto array = *maybe_array; + + auto field = arrow::field("", input.type); + + auto table = arrow::Table::Make(arrow::schema({field}), {array}); + + auto cudf_table = detail::from_arrow(*table, stream, mr); + + auto cv = cudf_table->view().column(0); + return get_element(cv, 0, stream); +} + } // namespace detail std::unique_ptr
from_arrow(arrow::Table const& input_table, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_arrow(input_table, cudf::get_default_stream(), mr); + return detail::from_arrow(input_table, stream, mr); } +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + return detail::from_arrow(input, stream, mr); +} } // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 958a2fcb95f..0cd750bc947 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -15,14 +15,16 @@ */ #include +#include #include +#include #include #include +#include #include #include #include #include -#include #include #include #include @@ -77,7 +79,10 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, auto mask_buffer = allocate_arrow_bitmap(static_cast(input_view.size()), ar_mr); CUDF_CUDA_TRY(cudaMemcpyAsync( mask_buffer->mutable_data(), - (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(), + (input_view.offset() > 0) + ? cudf::detail::copy_bitmask(input_view, stream, rmm::mr::get_current_device_resource()) + .data() + : input_view.null_mask(), mask_size_in_bytes, cudaMemcpyDefault, stream.value())); @@ -139,29 +144,36 @@ struct dispatch_to_arrow { } }; -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) +// Convert decimal types from libcudf to arrow where those types are not +// directly supported by Arrow. These types must be fit into 128 bits, the +// smallest decimal resolution supported by Arrow. +template +std::shared_ptr unsupported_decimals_to_arrow(column_view input, + int32_t precision, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) { - using DeviceType = int64_t; - size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); auto count = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(cudf::get_default_stream()), - count, - count + input.size(), - [in = input.begin(), out = buf.data()] __device__(auto in_idx) { - auto const out_idx = in_idx * 2; - out[out_idx] = in[in_idx]; - out[out_idx + 1] = in[in_idx] < 0 ? -1 : 0; - }); + thrust::for_each( + rmm::exec_policy(cudf::get_default_stream()), + count, + count + input.size(), + [in = input.begin(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? 
-1 : 0; + } + }); auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); @@ -169,7 +181,7 @@ std::shared_ptr dispatch_to_arrow::operator()( CUDF_CUDA_TRY(cudaMemcpyAsync( data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - auto type = arrow::decimal(18, -input.type().scale()); + auto type = arrow::decimal(precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, std::move(data_buffer)}; auto data = std::make_shared(type, input.size(), buffers); @@ -177,6 +189,28 @@ std::shared_ptr dispatch_to_arrow::operator()( return std::make_shared(data); } +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id, + column_metadata const&, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + return unsupported_decimals_to_arrow(input, 9, ar_mr, stream); +} + +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id, + column_metadata const&, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + return unsupported_decimals_to_arrow(input, 18, ar_mr, stream); +} + template <> std::shared_ptr dispatch_to_arrow::operator()( column_view input, @@ -403,14 +437,37 @@ std::shared_ptr to_arrow(table_view input, return result; } + +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr) +{ + auto const column = cudf::make_column_from_scalar(input, 1, stream); + cudf::table_view const tv{{column->view()}}; + auto const arrow_table = cudf::to_arrow(tv, {metadata}, stream); + auto const ac = arrow_table->column(0); + auto const maybe_scalar = ac->GetScalar(0); + if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } + return maybe_scalar.ValueOrDie(); +} } // namespace detail std::shared_ptr to_arrow(table_view input, std::vector const& metadata, + rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) { CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); + return detail::to_arrow(input, metadata, stream, ar_mr); } +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_arrow(input, metadata, stream, ar_mr); +} } // namespace cudf diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index cabf904f020..5d7fb9d6b43 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -16,14 +16,13 @@ #include "nested_json.hpp" #include -#include +#include #include #include #include #include #include -#include #include #include #include @@ -331,23 +330,27 @@ std::vector copy_strings_to_host(device_span input, { CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); - rmm::device_uvector> string_views(num_strings, stream); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); thrust::transform(rmm::exec_policy(stream), d_offset_pairs, d_offset_pairs + num_strings, - string_views.begin(), - [data = input.data()] __device__(auto const& offsets) { + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto 
const& offsets) { // Note: first character for non-field columns - return thrust::make_pair( - data + thrust::get<0>(offsets), + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); }); cudf::io::parse_options_view options_view{}; options_view.quotechar = '\0'; // no quotes options_view.keepquotes = true; - auto d_column_names = parse_data(string_views.begin(), + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, num_strings, data_type{type_id::STRING}, rmm::device_buffer{}, @@ -355,7 +358,7 @@ std::vector copy_strings_to_host(device_span input, options_view, stream, rmm::mr::get_current_device_resource()); - auto to_host = [stream](auto const& col) { + auto to_host = [stream](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( @@ -763,19 +766,6 @@ std::pair, std::vector> device_json_co // TODO how about directly storing pair in json_column? auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); data_type target_type{}; @@ -790,12 +780,13 @@ std::pair, std::vector> device_json_co // Infer column type, if we don't have an explicit type for it else { target_type = cudf::io::detail::infer_data_type( - options.json_view(), d_input, string_ranges_it, col_size, stream); + options.json_view(), d_input, offset_length_it, col_size, stream); } auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 9231040eb70..da5b0eedfbd 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -15,8 +15,6 @@ */ #include "nested_json.hpp" -#include -#include #include #include @@ -24,7 +22,9 @@ #include #include #include +#include #include +#include #include #include diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 0b49f97597d..06ac11485cb 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -19,14 +19,13 @@ #include #include #include -#include +#include #include #include #include #include #include -#include #include #include #include @@ -1949,20 +1948,6 @@ std::pair, std::vector> json_column_to auto offset_length_it = thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) 
{ - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - data_type target_type{}; if (schema.has_value()) { @@ -1978,7 +1963,7 @@ std::pair, std::vector> json_column_to target_type = cudf::io::detail::infer_data_type(parsing_options(options, stream).json_view(), d_input, - string_ranges_it, + offset_length_it, col_size, stream); } @@ -1986,7 +1971,8 @@ std::pair, std::vector> json_column_to auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1e44522ed33..2d363c51fce 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -20,6 +20,7 @@ */ #include +#include #include #include @@ -27,9 +28,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 0007530a5af..1d2262a1ccc 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -60,6 +60,7 @@ void rowgroup_char_counts(device_2dspan counts, auto const num_rowgroups = rowgroup_bounds.size().first; auto const num_str_cols = str_col_indexes.size(); + if (num_str_cols == 0) { return; } int block_size = 0; // suggested thread count to use int min_grid_size = 0; // minimum block count required diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index ae11af92f78..5c7b8ca3f8c 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -18,27 +18,474 @@ #include #include +#include #include namespace cudf { namespace io { namespace parquet { -uint8_t const CompactProtocolReader::g_list2struct[16] = {0, - 1, - 2, - ST_FLD_BYTE, - ST_FLD_DOUBLE, - 5, - ST_FLD_I16, - 7, - ST_FLD_I32, - 9, - ST_FLD_I64, - ST_FLD_BINARY, - ST_FLD_STRUCT, - ST_FLD_MAP, - ST_FLD_SET, - ST_FLD_LIST}; + +/** + * @brief Base class for parquet field functors. + * + * Holds the field value used by all of the specialized functors. + */ +class parquet_field { + private: + int _field_val; + + protected: + parquet_field(int f) : _field_val(f) {} + + public: + virtual ~parquet_field() = default; + int field() const { return _field_val; } +}; + +/** + * @brief Abstract base class for list functors. 
+ */ +template +class parquet_field_list : public parquet_field { + private: + using read_func_type = std::function; + FieldType _expected_type; + read_func_type _read_value; + + protected: + std::vector& val; + + void bind_read_func(read_func_type fn) { _read_value = fn; } + + parquet_field_list(int f, std::vector& v, FieldType t) + : parquet_field(f), _expected_type(t), val(v) + { + } + + public: + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_LIST) { return true; } + auto const [t, n] = cpr->get_listh(); + if (t != _expected_type) { return true; } + val.resize(n); + for (uint32_t i = 0; i < n; i++) { + if (_read_value(i, cpr)) { return true; } + } + return false; + } +}; + +/** + * @brief Functor to set value to bool read from CompactProtocolReader + * + * bool doesn't actually encode a value, we just use the field type to indicate true/false + * + * @return True if field type is not bool + */ +class parquet_field_bool : public parquet_field { + bool& val; + + public: + parquet_field_bool(int f, bool& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) { return true; } + val = field_type == ST_FLD_TRUE; + return false; + } +}; + +/** + * @brief Functor to read a vector of booleans from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * bool fails + */ +struct parquet_field_bool_list : public parquet_field_list { + parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_TRUE) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + auto const current_byte = cpr->getb(); + if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) { return true; } + this->val[i] = current_byte == ST_FLD_TRUE; + return false; + }; + bind_read_func(read_value); + } +}; + +/** + * @brief Base type for a functor that reads an integer from CompactProtocolReader + * + * Assuming signed ints since the parquet spec does not use unsigned ints anywhere. 
+ * + * @return True if there is a type mismatch + */ +template +class parquet_field_int : public parquet_field { + static constexpr bool is_byte = std::is_same_v; + + T& val; + + public: + parquet_field_int(int f, T& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if constexpr (is_byte) { + val = cpr->getb(); + } else { + val = cpr->get_zigzag(); + } + return (field_type != EXPECTED_TYPE); + } +}; + +using parquet_field_int8 = parquet_field_int; +using parquet_field_int32 = parquet_field_int; +using parquet_field_int64 = parquet_field_int; + +/** + * @brief Functor to read a vector of integers from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading an + * integer fails + */ +template +struct parquet_field_int_list : public parquet_field_list { + parquet_field_int_list(int f, std::vector& v) : parquet_field_list(f, v, EXPECTED_TYPE) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + this->val[i] = cpr->get_zigzag(); + return false; + }; + this->bind_read_func(read_value); + } +}; + +using parquet_field_int64_list = parquet_field_int_list; + +/** + * @brief Functor to read a string from CompactProtocolReader + * + * @return True if field type mismatches or if size of string exceeds bounds + * of the CompactProtocolReader + */ +class parquet_field_string : public parquet_field { + std::string& val; + + public: + parquet_field_string(int f, std::string& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_BINARY) { return true; } + auto const n = cpr->get_u32(); + if (n < static_cast(cpr->m_end - cpr->m_cur)) { + val.assign(reinterpret_cast(cpr->m_cur), n); + cpr->m_cur += n; + return false; + } else { + return true; + } + } +}; + +/** + * @brief Functor to read a vector of strings from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * string fails + */ +struct parquet_field_string_list : public parquet_field_list { + parquet_field_string_list(int f, std::vector& v) + : parquet_field_list(f, v, ST_FLD_BINARY) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + auto const l = cpr->get_u32(); + if (l < static_cast(cpr->m_end - cpr->m_cur)) { + this->val[i].assign(reinterpret_cast(cpr->m_cur), l); + cpr->m_cur += l; + } else { + return true; + } + return false; + }; + bind_read_func(read_value); + } +}; + +/** + * @brief Functor to set value to enum read from CompactProtocolReader + * + * @return True if field type is not int32 + */ +template +class parquet_field_enum : public parquet_field { + Enum& val; + + public: + parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + val = static_cast(cpr->get_i32()); + return (field_type != ST_FLD_I32); + } +}; + +/** + * @brief Functor to read a vector of enums from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading an + * enum fails + */ +template +struct parquet_field_enum_list : public parquet_field_list { + parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_I32) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + this->val[i] = static_cast(cpr->get_i32()); + return false; + }; + this->bind_read_func(read_value); + } +}; + +/** + * @brief Functor to read a structure from 
CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * struct fails + */ +template +class parquet_field_struct : public parquet_field { + T& val; + + public: + parquet_field_struct(int f, T& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); + } +}; + +/** + * @brief Functor to read optional structures in unions + * + * @return True if field types mismatch + */ +template +class parquet_field_union_struct : public parquet_field { + E& enum_val; + thrust::optional& val; // union structs are always wrapped in std::optional + + public: + parquet_field_union_struct(int f, E& ev, thrust::optional& v) + : parquet_field(f), enum_val(ev), val(v) + { + } + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + T v; + bool const res = parquet_field_struct(field(), v).operator()(cpr, field_type); + if (!res) { + val = v; + enum_val = static_cast(field()); + } + return res; + } +}; + +/** + * @brief Functor to read empty structures in unions + * + * Added to avoid having to define read() functions for empty structs contained in unions. + * + * @return True if field types mismatch + */ +template +class parquet_field_union_enumerator : public parquet_field { + E& val; + + public: + parquet_field_union_enumerator(int f, E& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { return true; } + cpr->skip_struct_field(field_type); + val = static_cast(field()); + return false; + } +}; + +/** + * @brief Functor to read a vector of structures from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * struct fails + */ +template +struct parquet_field_struct_list : public parquet_field_list { + parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_STRUCT) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + if (not cpr->read(&this->val[i])) { return true; } + return false; + }; + this->bind_read_func(read_value); + } +}; + +// TODO(ets): replace current union handling (which mirrors thrift) to use std::optional fields +// in a struct +/** + * @brief Functor to read a union member from CompactProtocolReader + * + * @tparam is_empty True if tparam `T` type is empty type, else false. 
+ * + * @return True if field types mismatch or if the process of reading a + * union member fails + */ +template +class ParquetFieldUnionFunctor : public parquet_field { + bool& is_set; + T& val; + + public: + ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { + return true; + } else { + is_set = true; + return !cpr->read(&val); + } + } +}; + +template +class ParquetFieldUnionFunctor : public parquet_field { + bool& is_set; + T& val; + + public: + ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { + return true; + } else { + is_set = true; + cpr->skip_struct_field(field_type); + return false; + } + } +}; + +template +ParquetFieldUnionFunctor> ParquetFieldUnion(int f, bool& b, T& v) +{ + return ParquetFieldUnionFunctor>(f, b, v); +} + +/** + * @brief Functor to read a binary from CompactProtocolReader + * + * @return True if field type mismatches or if size of binary exceeds bounds + * of the CompactProtocolReader + */ +class parquet_field_binary : public parquet_field { + std::vector& val; + + public: + parquet_field_binary(int f, std::vector& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_BINARY) { return true; } + auto const n = cpr->get_u32(); + if (n <= static_cast(cpr->m_end - cpr->m_cur)) { + val.resize(n); + val.assign(cpr->m_cur, cpr->m_cur + n); + cpr->m_cur += n; + return false; + } else { + return true; + } + } +}; + +/** + * @brief Functor to read a vector of binaries from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * binary fails + */ +struct parquet_field_binary_list : public parquet_field_list> { + parquet_field_binary_list(int f, std::vector>& v) + : parquet_field_list(f, v, ST_FLD_BINARY) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + auto const l = cpr->get_u32(); + if (l <= static_cast(cpr->m_end - cpr->m_cur)) { + val[i].resize(l); + val[i].assign(cpr->m_cur, cpr->m_cur + l); + cpr->m_cur += l; + } else { + return true; + } + return false; + }; + bind_read_func(read_value); + } +}; + +/** + * @brief Functor to read a struct from CompactProtocolReader + * + * @return True if field type mismatches + */ +class parquet_field_struct_blob : public parquet_field { + std::vector& val; + + public: + parquet_field_struct_blob(int f, std::vector& v) : parquet_field(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { return true; } + uint8_t const* const start = cpr->m_cur; + cpr->skip_struct_field(field_type); + if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } + return false; + } +}; + +/** + * @brief functor to wrap functors for optional fields + */ +template +class parquet_field_optional : public parquet_field { + thrust::optional& val; + + public: + parquet_field_optional(int f, thrust::optional& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + T v; + bool const res = FieldFunctor(field(), v).operator()(cpr, field_type); + if (!res) { val = v; } + return res; + } +}; /** * @brief Skips the number of bytes according to the specified struct type @@ -59,22 +506,21 @@ 
bool CompactProtocolReader::skip_struct_field(int t, int depth) case ST_FLD_BYTE: skip_bytes(1); break; case ST_FLD_DOUBLE: skip_bytes(8); break; case ST_FLD_BINARY: skip_bytes(get_u32()); break; - case ST_FLD_LIST: + case ST_FLD_LIST: [[fallthrough]]; case ST_FLD_SET: { - int c = getb(); - int n = c >> 4; - if (n == 0xf) n = get_i32(); - t = g_list2struct[c & 0xf]; - if (depth > 10) return false; - for (int32_t i = 0; i < n; i++) + auto const [t, n] = get_listh(); + if (depth > 10) { return false; } + for (uint32_t i = 0; i < n; i++) { skip_struct_field(t, depth + 1); + } } break; case ST_FLD_STRUCT: for (;;) { - int c = getb(); - t = c & 0xf; - if (!c) break; - if (depth > 10) return false; + int const c = getb(); + t = c & 0xf; + if (c == 0) { break; } // end of struct + if ((c & 0xf0) == 0) { get_i16(); } // field id is not a delta + if (depth > 10) { return false; } skip_struct_field(t, depth + 1); } break; @@ -125,11 +571,11 @@ inline bool function_builder(CompactProtocolReader* cpr, std::tuple int field = 0; while (true) { int const current_byte = cpr->getb(); - if (!current_byte) break; - int const field_delta = current_byte >> 4; - int const field_type = current_byte & 0xf; - field = field_delta ? field + field_delta : cpr->get_i16(); - bool exit_function = FunctionSwitchImpl::run(cpr, field_type, field, op); + if (!current_byte) { break; } + int const field_delta = current_byte >> 4; + int const field_type = current_byte & 0xf; + field = field_delta ? field + field_delta : cpr->get_i16(); + bool const exit_function = FunctionSwitchImpl::run(cpr, field_type, field, op); if (exit_function) { return false; } } return true; @@ -137,27 +583,30 @@ inline bool function_builder(CompactProtocolReader* cpr, std::tuple bool CompactProtocolReader::read(FileMetaData* f) { - auto op = std::make_tuple(ParquetFieldInt32(1, f->version), - ParquetFieldStructList(2, f->schema), - ParquetFieldInt64(3, f->num_rows), - ParquetFieldStructList(4, f->row_groups), - ParquetFieldStructList(5, f->key_value_metadata), - ParquetFieldString(6, f->created_by)); + using optional_list_column_order = + parquet_field_optional, parquet_field_struct_list>; + auto op = std::make_tuple(parquet_field_int32(1, f->version), + parquet_field_struct_list(2, f->schema), + parquet_field_int64(3, f->num_rows), + parquet_field_struct_list(4, f->row_groups), + parquet_field_struct_list(5, f->key_value_metadata), + parquet_field_string(6, f->created_by), + optional_list_column_order(7, f->column_orders)); return function_builder(this, op); } bool CompactProtocolReader::read(SchemaElement* s) { - auto op = std::make_tuple(ParquetFieldEnum(1, s->type), - ParquetFieldInt32(2, s->type_length), - ParquetFieldEnum(3, s->repetition_type), - ParquetFieldString(4, s->name), - ParquetFieldInt32(5, s->num_children), - ParquetFieldEnum(6, s->converted_type), - ParquetFieldInt32(7, s->decimal_scale), - ParquetFieldInt32(8, s->decimal_precision), - ParquetFieldOptionalInt32(9, s->field_id), - ParquetFieldStruct(10, s->logical_type)); + auto op = std::make_tuple(parquet_field_enum(1, s->type), + parquet_field_int32(2, s->type_length), + parquet_field_enum(3, s->repetition_type), + parquet_field_string(4, s->name), + parquet_field_int32(5, s->num_children), + parquet_field_enum(6, s->converted_type), + parquet_field_int32(7, s->decimal_scale), + parquet_field_int32(8, s->decimal_precision), + parquet_field_optional(9, s->field_id), + parquet_field_struct(10, s->logical_type)); return function_builder(this, op); } @@ -181,21 +630,21 @@ bool 
CompactProtocolReader::read(LogicalType* l) bool CompactProtocolReader::read(DecimalType* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->scale), ParquetFieldInt32(2, d->precision)); + auto op = std::make_tuple(parquet_field_int32(1, d->scale), parquet_field_int32(2, d->precision)); return function_builder(this, op); } bool CompactProtocolReader::read(TimeType* t) { auto op = - std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); + std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); return function_builder(this, op); } bool CompactProtocolReader::read(TimestampType* t) { auto op = - std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); + std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); return function_builder(this, op); } @@ -209,123 +658,129 @@ bool CompactProtocolReader::read(TimeUnit* u) bool CompactProtocolReader::read(IntType* i) { - auto op = std::make_tuple(ParquetFieldInt8(1, i->bitWidth), ParquetFieldBool(2, i->isSigned)); + auto op = std::make_tuple(parquet_field_int8(1, i->bitWidth), parquet_field_bool(2, i->isSigned)); return function_builder(this, op); } bool CompactProtocolReader::read(RowGroup* r) { - auto op = std::make_tuple(ParquetFieldStructList(1, r->columns), - ParquetFieldInt64(2, r->total_byte_size), - ParquetFieldInt64(3, r->num_rows)); + auto op = std::make_tuple(parquet_field_struct_list(1, r->columns), + parquet_field_int64(2, r->total_byte_size), + parquet_field_int64(3, r->num_rows)); return function_builder(this, op); } bool CompactProtocolReader::read(ColumnChunk* c) { - auto op = std::make_tuple(ParquetFieldString(1, c->file_path), - ParquetFieldInt64(2, c->file_offset), - ParquetFieldStruct(3, c->meta_data), - ParquetFieldInt64(4, c->offset_index_offset), - ParquetFieldInt32(5, c->offset_index_length), - ParquetFieldInt64(6, c->column_index_offset), - ParquetFieldInt32(7, c->column_index_length)); + auto op = std::make_tuple(parquet_field_string(1, c->file_path), + parquet_field_int64(2, c->file_offset), + parquet_field_struct(3, c->meta_data), + parquet_field_int64(4, c->offset_index_offset), + parquet_field_int32(5, c->offset_index_length), + parquet_field_int64(6, c->column_index_offset), + parquet_field_int32(7, c->column_index_length)); return function_builder(this, op); } bool CompactProtocolReader::read(ColumnChunkMetaData* c) { - auto op = std::make_tuple(ParquetFieldEnum(1, c->type), - ParquetFieldEnumList(2, c->encodings), - ParquetFieldStringList(3, c->path_in_schema), - ParquetFieldEnum(4, c->codec), - ParquetFieldInt64(5, c->num_values), - ParquetFieldInt64(6, c->total_uncompressed_size), - ParquetFieldInt64(7, c->total_compressed_size), - ParquetFieldInt64(9, c->data_page_offset), - ParquetFieldInt64(10, c->index_page_offset), - ParquetFieldInt64(11, c->dictionary_page_offset), - ParquetFieldStruct(12, c->statistics)); + auto op = std::make_tuple(parquet_field_enum(1, c->type), + parquet_field_enum_list(2, c->encodings), + parquet_field_string_list(3, c->path_in_schema), + parquet_field_enum(4, c->codec), + parquet_field_int64(5, c->num_values), + parquet_field_int64(6, c->total_uncompressed_size), + parquet_field_int64(7, c->total_compressed_size), + parquet_field_int64(9, c->data_page_offset), + parquet_field_int64(10, c->index_page_offset), + parquet_field_int64(11, c->dictionary_page_offset), + parquet_field_struct(12, c->statistics)); return function_builder(this, op); } 
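For readers unfamiliar with the wire format these functors consume: the Thrift compact protocol encodes each struct as a run of field headers (a delta-encoded field id packed with a type nibble, terminated by a zero byte), with integers stored as base-128 varints and signed values zigzag-mapped first. Below is a minimal, self-contained sketch of that decoding under those assumptions, matching the behavior of function_builder and get_zigzag in this change; byte_reader, decode_varint, decode_zigzag and decode_field_header are illustrative names, not part of cuDF.

#include <cstdint>
#include <utility>

struct byte_reader {
  uint8_t const* cur;
  uint8_t const* end;
  uint8_t next() { return cur < end ? *cur++ : 0; }
};

// Base-128 varint: seven payload bits per byte, MSB set means "more bytes follow".
uint64_t decode_varint(byte_reader& r)
{
  uint64_t v = 0;
  for (uint32_t shift = 0;; shift += 7) {
    uint64_t const b = r.next();
    v |= (b & 0x7f) << shift;
    if (b < 0x80) { break; }
  }
  return v;
}

// Zigzag maps signed integers onto unsigned ones; decode with (u >> 1) ^ -(u & 1).
int64_t decode_zigzag(byte_reader& r)
{
  uint64_t const u = decode_varint(r);
  return static_cast<int64_t>((u >> 1) ^ -(u & 1));
}

// A field header packs a field-id delta in the high nibble and the field type in
// the low nibble; a zero byte ends the struct, and a zero delta means the absolute
// field id follows as a zigzag-encoded i16.
std::pair<int, int> decode_field_header(byte_reader& r, int prev_field_id)
{
  uint8_t const b = r.next();
  if (b == 0) { return {0, 0}; }  // stop field: end of struct
  int const type  = b & 0xf;
  int const delta = b >> 4;
  int const id    = delta != 0 ? prev_field_id + delta : static_cast<int>(decode_zigzag(r));
  return {id, type};
}

The read() overloads in this file loop over such headers and hand each (id, type) pair to the matching functor in the tuple, skipping any field id they do not recognize.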
bool CompactProtocolReader::read(PageHeader* p) { - auto op = std::make_tuple(ParquetFieldEnum(1, p->type), - ParquetFieldInt32(2, p->uncompressed_page_size), - ParquetFieldInt32(3, p->compressed_page_size), - ParquetFieldStruct(5, p->data_page_header), - ParquetFieldStruct(7, p->dictionary_page_header), - ParquetFieldStruct(8, p->data_page_header_v2)); + auto op = std::make_tuple(parquet_field_enum(1, p->type), + parquet_field_int32(2, p->uncompressed_page_size), + parquet_field_int32(3, p->compressed_page_size), + parquet_field_struct(5, p->data_page_header), + parquet_field_struct(7, p->dictionary_page_header), + parquet_field_struct(8, p->data_page_header_v2)); return function_builder(this, op); } bool CompactProtocolReader::read(DataPageHeader* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), - ParquetFieldEnum(2, d->encoding), - ParquetFieldEnum(3, d->definition_level_encoding), - ParquetFieldEnum(4, d->repetition_level_encoding)); + auto op = std::make_tuple(parquet_field_int32(1, d->num_values), + parquet_field_enum(2, d->encoding), + parquet_field_enum(3, d->definition_level_encoding), + parquet_field_enum(4, d->repetition_level_encoding)); return function_builder(this, op); } bool CompactProtocolReader::read(DictionaryPageHeader* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), - ParquetFieldEnum(2, d->encoding)); + auto op = std::make_tuple(parquet_field_int32(1, d->num_values), + parquet_field_enum(2, d->encoding)); return function_builder(this, op); } bool CompactProtocolReader::read(DataPageHeaderV2* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), - ParquetFieldInt32(2, d->num_nulls), - ParquetFieldInt32(3, d->num_rows), - ParquetFieldEnum(4, d->encoding), - ParquetFieldInt32(5, d->definition_levels_byte_length), - ParquetFieldInt32(6, d->repetition_levels_byte_length), - ParquetFieldBool(7, d->is_compressed)); + auto op = std::make_tuple(parquet_field_int32(1, d->num_values), + parquet_field_int32(2, d->num_nulls), + parquet_field_int32(3, d->num_rows), + parquet_field_enum(4, d->encoding), + parquet_field_int32(5, d->definition_levels_byte_length), + parquet_field_int32(6, d->repetition_levels_byte_length), + parquet_field_bool(7, d->is_compressed)); return function_builder(this, op); } bool CompactProtocolReader::read(KeyValue* k) { - auto op = std::make_tuple(ParquetFieldString(1, k->key), ParquetFieldString(2, k->value)); + auto op = std::make_tuple(parquet_field_string(1, k->key), parquet_field_string(2, k->value)); return function_builder(this, op); } bool CompactProtocolReader::read(PageLocation* p) { - auto op = std::make_tuple(ParquetFieldInt64(1, p->offset), - ParquetFieldInt32(2, p->compressed_page_size), - ParquetFieldInt64(3, p->first_row_index)); + auto op = std::make_tuple(parquet_field_int64(1, p->offset), + parquet_field_int32(2, p->compressed_page_size), + parquet_field_int64(3, p->first_row_index)); return function_builder(this, op); } bool CompactProtocolReader::read(OffsetIndex* o) { - auto op = std::make_tuple(ParquetFieldStructList(1, o->page_locations)); + auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations)); return function_builder(this, op); } bool CompactProtocolReader::read(ColumnIndex* c) { - auto op = std::make_tuple(ParquetFieldBoolList(1, c->null_pages), - ParquetFieldBinaryList(2, c->min_values), - ParquetFieldBinaryList(3, c->max_values), - ParquetFieldEnum(4, c->boundary_order), - ParquetFieldInt64List(5, c->null_counts)); + auto op = 
std::make_tuple(parquet_field_bool_list(1, c->null_pages), + parquet_field_binary_list(2, c->min_values), + parquet_field_binary_list(3, c->max_values), + parquet_field_enum(4, c->boundary_order), + parquet_field_int64_list(5, c->null_counts)); return function_builder(this, op); } bool CompactProtocolReader::read(Statistics* s) { - auto op = std::make_tuple(ParquetFieldBinary(1, s->max), - ParquetFieldBinary(2, s->min), - ParquetFieldInt64(3, s->null_count), - ParquetFieldInt64(4, s->distinct_count), - ParquetFieldBinary(5, s->max_value), - ParquetFieldBinary(6, s->min_value)); + auto op = std::make_tuple(parquet_field_binary(1, s->max), + parquet_field_binary(2, s->min), + parquet_field_int64(3, s->null_count), + parquet_field_int64(4, s->distinct_count), + parquet_field_binary(5, s->max_value), + parquet_field_binary(6, s->min_value)); + return function_builder(this, op); +} + +bool CompactProtocolReader::read(ColumnOrder* c) +{ + auto op = std::make_tuple(parquet_field_union_enumerator(1, c->type)); return function_builder(this, op); } @@ -338,7 +793,7 @@ bool CompactProtocolReader::read(Statistics* s) */ bool CompactProtocolReader::InitSchema(FileMetaData* md) { - if (static_cast(WalkSchema(md)) != md->schema.size()) return false; + if (static_cast(WalkSchema(md)) != md->schema.size()) { return false; } /* Inside FileMetaData, there is a std::vector of RowGroups and each RowGroup contains a * a std::vector of ColumnChunks. Each ColumnChunk has a member ColumnMetaData, which contains @@ -353,13 +808,15 @@ bool CompactProtocolReader::InitSchema(FileMetaData* md) for (auto const& path : column.meta_data.path_in_schema) { auto const it = [&] { // find_if starting at (current_schema_index + 1) and then wrapping - auto schema = [&](auto const& e) { return e.parent_idx == parent && e.name == path; }; - auto mid = md->schema.cbegin() + current_schema_index + 1; - auto it = std::find_if(mid, md->schema.cend(), schema); - if (it != md->schema.cend()) return it; + auto const schema = [&](auto const& e) { + return e.parent_idx == parent && e.name == path; + }; + auto const mid = md->schema.cbegin() + current_schema_index + 1; + auto const it = std::find_if(mid, md->schema.cend(), schema); + if (it != md->schema.cend()) { return it; } return std::find_if(md->schema.cbegin(), mid, schema); }(); - if (it == md->schema.cend()) return false; + if (it == md->schema.cend()) { return false; } current_schema_index = std::distance(md->schema.cbegin(), it); column.schema_idx = current_schema_index; parent = current_schema_index; @@ -401,9 +858,9 @@ int CompactProtocolReader::WalkSchema( if (e->num_children > 0) { for (int i = 0; i < e->num_children; i++) { e->children_idx.push_back(idx); - int idx_old = idx; - idx = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level); - if (idx <= idx_old) break; // Error + int const idx_old = idx; + idx = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level); + if (idx <= idx_old) { break; } // Error } } return idx; diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 62ccacaac37..619815db503 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace cudf { @@ -40,9 +41,6 @@ namespace parquet { * compression codecs are supported yet. 
*/ class CompactProtocolReader { - protected: - static const uint8_t g_list2struct[16]; - public: explicit CompactProtocolReader(uint8_t const* base = nullptr, size_t len = 0) { init(base, len); } void init(uint8_t const* base, size_t len) @@ -57,45 +55,46 @@ class CompactProtocolReader { bytecnt = std::min(bytecnt, (size_t)(m_end - m_cur)); m_cur += bytecnt; } - uint32_t get_u32() noexcept + + // returns a varint encoded integer + template + T get_varint() noexcept { - uint32_t v = 0; + T v = 0; for (uint32_t l = 0;; l += 7) { - uint32_t c = getb(); + T c = getb(); v |= (c & 0x7f) << l; - if (c < 0x80) break; + if (c < 0x80) { break; } } return v; } - uint64_t get_u64() noexcept - { - uint64_t v = 0; - for (uint64_t l = 0;; l += 7) { - uint64_t c = getb(); - v |= (c & 0x7f) << l; - if (c < 0x80) break; - } - return v; - } - int32_t get_i16() noexcept { return get_i32(); } - int32_t get_i32() noexcept - { - uint32_t u = get_u32(); - return (int32_t)((u >> 1u) ^ -(int32_t)(u & 1)); - } - int64_t get_i64() noexcept + + // returns a zigzag encoded signed integer + template + T get_zigzag() noexcept { - uint64_t u = get_u64(); - return (int64_t)((u >> 1u) ^ -(int64_t)(u & 1)); + using U = std::make_unsigned_t; + U const u = get_varint(); + return static_cast((u >> 1u) ^ -static_cast(u & 1)); } - int32_t get_listh(uint8_t* el_type) noexcept + + // thrift spec says to use zigzag i32 for i16 types + int32_t get_i16() noexcept { return get_zigzag(); } + int32_t get_i32() noexcept { return get_zigzag(); } + int64_t get_i64() noexcept { return get_zigzag(); } + + uint32_t get_u32() noexcept { return get_varint(); } + uint64_t get_u64() noexcept { return get_varint(); } + + [[nodiscard]] std::pair get_listh() noexcept { - uint32_t c = getb(); - int32_t sz = c >> 4; - *el_type = c & 0xf; - if (sz == 0xf) sz = get_u32(); - return sz; + uint32_t const c = getb(); + uint32_t sz = c >> 4; + uint8_t t = c & 0xf; + if (sz == 0xf) { sz = get_u32(); } + return {t, sz}; } + bool skip_struct_field(int t, int depth = 0); public: @@ -120,6 +119,7 @@ class CompactProtocolReader { bool read(OffsetIndex* o); bool read(ColumnIndex* c); bool read(Statistics* s); + bool read(ColumnOrder* c); public: static int NumRequiredBits(uint32_t max_level) noexcept @@ -140,523 +140,11 @@ class CompactProtocolReader { uint8_t const* m_cur = nullptr; uint8_t const* m_end = nullptr; - friend class ParquetFieldBool; - friend class ParquetFieldBoolList; - friend class ParquetFieldInt8; - friend class ParquetFieldInt32; - friend class ParquetFieldOptionalInt32; - friend class ParquetFieldInt64; - friend class ParquetFieldInt64List; - template - friend class ParquetFieldStructListFunctor; - friend class ParquetFieldString; - template - friend class ParquetFieldStructFunctor; - template - friend class ParquetFieldUnionFunctor; - template - friend class ParquetFieldEnum; - template - friend class ParquetFieldEnumListFunctor; - friend class ParquetFieldStringList; - friend class ParquetFieldBinary; - friend class ParquetFieldBinaryList; - friend class ParquetFieldStructBlob; -}; - -/** - * @brief Functor to set value to bool read from CompactProtocolReader - * - * @return True if field type is not bool - */ -class ParquetFieldBool { - int field_val; - bool& val; - - public: - ParquetFieldBool(int f, bool& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - return (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) || - !(val = (field_type == ST_FLD_TRUE), true); - } - - int 
field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of booleans from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * bool fails - */ -class ParquetFieldBoolList { - int field_val; - std::vector& val; - - public: - ParquetFieldBoolList(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_TRUE) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - unsigned int current_byte = cpr->getb(); - if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) return true; - val[i] = current_byte == ST_FLD_TRUE; - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to 8 bit integer read from CompactProtocolReader - * - * @return True if field type is not int8 - */ -class ParquetFieldInt8 { - int field_val; - int8_t& val; - - public: - ParquetFieldInt8(int f, int8_t& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->getb(); - return (field_type != ST_FLD_BYTE); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to 32 bit integer read from CompactProtocolReader - * - * @return True if field type is not int32 - */ -class ParquetFieldInt32 { - int field_val; - int32_t& val; - - public: - ParquetFieldInt32(int f, int32_t& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->get_i32(); - return (field_type != ST_FLD_I32); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to optional 32 bit integer read from CompactProtocolReader - * - * @return True if field type is not int32 - */ -class ParquetFieldOptionalInt32 { - int field_val; - std::optional& val; - - public: - ParquetFieldOptionalInt32(int f, std::optional& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->get_i32(); - return (field_type != ST_FLD_I32); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to 64 bit integer read from CompactProtocolReader - * - * @return True if field type is not int32 or int64 - */ -class ParquetFieldInt64 { - int field_val; - int64_t& val; - - public: - ParquetFieldInt64(int f, int64_t& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->get_i64(); - return (field_type < ST_FLD_I16 || field_type > ST_FLD_I64); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of 64-bit integers from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading an - * int64 fails - */ -class ParquetFieldInt64List { - int field_val; - std::vector& val; - - public: - ParquetFieldInt64List(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_I64) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - val[i] = cpr->get_i64(); - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of structures from CompactProtocolReader - * - * @return True if 
field types mismatch or if the process of reading a - * struct fails - */ -template -class ParquetFieldStructListFunctor { - int field_val; - std::vector& val; - - public: - ParquetFieldStructListFunctor(int f, std::vector& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - - int current_byte = cpr->getb(); - if ((current_byte & 0xf) != ST_FLD_STRUCT) return true; - int n = current_byte >> 4; - if (n == 0xf) n = cpr->get_u32(); - val.resize(n); - for (int32_t i = 0; i < n; i++) { - if (!(cpr->read(&val[i]))) { return true; } - } - - return false; - } - - int field() { return field_val; } -}; - -template -ParquetFieldStructListFunctor ParquetFieldStructList(int f, std::vector& v) -{ - return ParquetFieldStructListFunctor(f, v); -} - -/** - * @brief Functor to read a string from CompactProtocolReader - * - * @return True if field type mismatches or if size of string exceeds bounds - * of the CompactProtocolReader - */ -class ParquetFieldString { - int field_val; - std::string& val; - - public: - ParquetFieldString(int f, std::string& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_BINARY) return true; - uint32_t n = cpr->get_u32(); - if (n < (size_t)(cpr->m_end - cpr->m_cur)) { - val.assign((char const*)cpr->m_cur, n); - cpr->m_cur += n; - return false; - } else { - return true; - } - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a structure from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * struct fails - */ -template -class ParquetFieldStructFunctor { - int field_val; - T& val; - - public: - ParquetFieldStructFunctor(int f, T& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); - } - - int field() { return field_val; } -}; - -template -ParquetFieldStructFunctor ParquetFieldStruct(int f, T& v) -{ - return ParquetFieldStructFunctor(f, v); -} - -/** - * @brief Functor to read a union member from CompactProtocolReader - * - * @tparam is_empty True if tparam `T` type is empty type, else false. 
- * - * @return True if field types mismatch or if the process of reading a - * union member fails - */ -template -class ParquetFieldUnionFunctor { - int field_val; - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - return !cpr->read(&val); - } - } - - int field() { return field_val; } -}; - -template -struct ParquetFieldUnionFunctor { - int field_val; - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - cpr->skip_struct_field(field_type); - return false; - } - } - - int field() { return field_val; } -}; - -template -ParquetFieldUnionFunctor> ParquetFieldUnion(int f, bool& b, T& v) -{ - return ParquetFieldUnionFunctor>(f, b, v); -} - -/** - * @brief Functor to set value to enum read from CompactProtocolReader - * - * @return True if field type is not int32 - */ -template -class ParquetFieldEnum { - int field_val; - Enum& val; - - public: - ParquetFieldEnum(int f, Enum& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = static_cast(cpr->get_i32()); - return (field_type != ST_FLD_I32); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of enums from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading an - * enum fails - */ -template -class ParquetFieldEnumListFunctor { - int field_val; - std::vector& val; - - public: - ParquetFieldEnumListFunctor(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - int current_byte = cpr->getb(); - if ((current_byte & 0xf) != ST_FLD_I32) return true; - int n = current_byte >> 4; - if (n == 0xf) n = cpr->get_u32(); - val.resize(n); - for (int32_t i = 0; i < n; i++) { - val[i] = static_cast(cpr->get_i32()); - } - return false; - } - - int field() { return field_val; } -}; - -template -ParquetFieldEnumListFunctor ParquetFieldEnumList(int field, std::vector& v) -{ - return ParquetFieldEnumListFunctor(field, v); -} - -/** - * @brief Functor to read a vector of strings from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * string fails - */ -class ParquetFieldStringList { - int field_val; - std::vector& val; - - public: - ParquetFieldStringList(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_BINARY) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - uint32_t l = cpr->get_u32(); - if (l < (size_t)(cpr->m_end - cpr->m_cur)) { - val[i].assign((char const*)cpr->m_cur, l); - cpr->m_cur += l; - } else - return true; - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a binary from CompactProtocolReader - * - * @return True if field type mismatches or if size of binary exceeds bounds - * of the CompactProtocolReader - */ -class ParquetFieldBinary { - int field_val; - 
std::vector& val; - - public: - ParquetFieldBinary(int f, std::vector& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_BINARY) return true; - uint32_t n = cpr->get_u32(); - if (n <= (size_t)(cpr->m_end - cpr->m_cur)) { - val.resize(n); - val.assign(cpr->m_cur, cpr->m_cur + n); - cpr->m_cur += n; - return false; - } else { - return true; - } - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of binaries from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * binary fails - */ -class ParquetFieldBinaryList { - int field_val; - std::vector>& val; - - public: - ParquetFieldBinaryList(int f, std::vector>& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_BINARY) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - uint32_t l = cpr->get_u32(); - if (l <= (size_t)(cpr->m_end - cpr->m_cur)) { - val[i].resize(l); - val[i].assign(cpr->m_cur, cpr->m_cur + l); - cpr->m_cur += l; - } else - return true; - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a struct from CompactProtocolReader - * - * @return True if field type mismatches - */ -class ParquetFieldStructBlob { - int field_val; - std::vector& val; - - public: - ParquetFieldStructBlob(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) return true; - uint8_t const* start = cpr->m_cur; - cpr->skip_struct_field(field_type); - if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } - return false; - } - - int field() { return field_val; } + friend class parquet_field_string; + friend class parquet_field_string_list; + friend class parquet_field_binary; + friend class parquet_field_binary_list; + friend class parquet_field_struct_blob; }; } // namespace parquet diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index b2c0c97c52d..60bc8984d81 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -33,18 +33,7 @@ size_t CompactProtocolWriter::write(FileMetaData const& f) c.field_struct_list(4, f.row_groups); if (not f.key_value_metadata.empty()) { c.field_struct_list(5, f.key_value_metadata); } if (not f.created_by.empty()) { c.field_string(6, f.created_by); } - if (f.column_order_listsize != 0) { - // Dummy list of struct containing an empty field1 struct - c.put_field_header(7, c.current_field(), ST_FLD_LIST); - c.put_byte((uint8_t)((std::min(f.column_order_listsize, 0xfu) << 4) | ST_FLD_STRUCT)); - if (f.column_order_listsize >= 0xf) c.put_uint(f.column_order_listsize); - for (uint32_t i = 0; i < f.column_order_listsize; i++) { - c.put_field_header(1, 0, ST_FLD_STRUCT); - c.put_byte(0); // ColumnOrder.field1 struct end - c.put_byte(0); // ColumnOrder struct end - } - c.set_current_field(7); - } + if (f.column_orders.has_value()) { c.field_struct_list(7, f.column_orders.value()); } return c.value(); } @@ -233,6 +222,16 @@ size_t CompactProtocolWriter::write(OffsetIndex const& s) return c.value(); } +size_t CompactProtocolWriter::write(ColumnOrder const& co) +{ + CompactProtocolFieldWriter c(*this); + switch (co) { + case 
ColumnOrder::TYPE_ORDER: c.field_empty_struct(1); break; + default: break; + } + return c.value(); +} + void CompactProtocolFieldWriter::put_byte(uint8_t v) { writer.m_buf.push_back(v); } void CompactProtocolFieldWriter::put_byte(uint8_t const* raw, uint32_t len) @@ -320,6 +319,13 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) current_field_value = field; } +inline void CompactProtocolFieldWriter::field_empty_struct(int field) +{ + put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_byte(0); // add a stop field + current_field_value = field; +} + template inline void CompactProtocolFieldWriter::field_struct_list(int field, std::vector const& val) { diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 8d7b0961934..26d66527aa5 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -53,6 +53,7 @@ class CompactProtocolWriter { size_t write(Statistics const&); size_t write(PageLocation const&); size_t write(OffsetIndex const&); + size_t write(ColumnOrder const&); protected: std::vector& m_buf; @@ -94,6 +95,8 @@ class CompactProtocolFieldWriter { template inline void field_struct(int field, T const& val); + inline void field_empty_struct(int field); + template inline void field_struct_list(int field, std::vector const& val); diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index f7318bb9935..c2affc774c2 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -18,6 +18,8 @@ #include "parquet_common.hpp" +#include + #include #include #include @@ -118,6 +120,16 @@ struct LogicalType { BsonType BSON; }; +/** + * Union to specify the order used for the min_value and max_value fields for a column. + */ +struct ColumnOrder { + enum Type { UNDEFINED, TYPE_ORDER }; + Type type; + + operator Type() const { return type; } +}; + /** * @brief Struct for describing an element/field in the Parquet format schema * @@ -135,7 +147,7 @@ struct SchemaElement { int32_t num_children = 0; int32_t decimal_scale = 0; int32_t decimal_precision = 0; - std::optional field_id = std::nullopt; + thrust::optional field_id = thrust::nullopt; bool output_as_byte_array = false; // The following fields are filled in later during schema initialization @@ -284,8 +296,8 @@ struct FileMetaData { int64_t num_rows = 0; std::vector row_groups; std::vector key_value_metadata; - std::string created_by = ""; - uint32_t column_order_listsize = 0; + std::string created_by = ""; + thrust::optional> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5f8f1617cb9..5a1716bb547 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -141,7 +141,7 @@ enum BoundaryOrder { /** * @brief Thrift compact protocol struct field types */ -enum { +enum FieldType { ST_FLD_TRUE = 1, ST_FLD_FALSE = 2, ST_FLD_BYTE = 3, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index d2976a3f5d9..a124f352ee4 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -74,8 +74,11 @@ struct aggregate_writer_metadata { for (size_t i = 0; i < partitions.size(); ++i) { this->files[i].num_rows = partitions[i].num_rows; } - this->column_order_listsize = - (stats_granularity != statistics_freq::STATISTICS_NONE) ? 
num_columns : 0; + + if (stats_granularity != statistics_freq::STATISTICS_NONE) { + ColumnOrder default_order = {ColumnOrder::TYPE_ORDER}; + this->column_orders = std::vector(num_columns, default_order); + } for (size_t p = 0; p < kv_md.size(); ++p) { std::transform(kv_md[p].begin(), @@ -102,13 +105,13 @@ struct aggregate_writer_metadata { { CUDF_EXPECTS(part < files.size(), "Invalid part index queried"); FileMetaData meta{}; - meta.version = this->version; - meta.schema = this->schema; - meta.num_rows = this->files[part].num_rows; - meta.row_groups = this->files[part].row_groups; - meta.key_value_metadata = this->files[part].key_value_metadata; - meta.created_by = this->created_by; - meta.column_order_listsize = this->column_order_listsize; + meta.version = this->version; + meta.schema = this->schema; + meta.num_rows = this->files[part].num_rows; + meta.row_groups = this->files[part].row_groups; + meta.key_value_metadata = this->files[part].key_value_metadata; + meta.created_by = this->created_by; + meta.column_orders = this->column_orders; return meta; } @@ -170,8 +173,8 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - std::string created_by = ""; - uint32_t column_order_listsize = 0; + std::string created_by = ""; + thrust::optional> column_orders = thrust::nullopt; }; namespace { @@ -2373,20 +2376,7 @@ std::unique_ptr> writer::merge_row_group_metadata( md.num_rows += tmp.num_rows; } } - // Reader doesn't currently populate column_order, so infer it here - if (not md.row_groups.empty()) { - auto const is_valid_stats = [](auto const& stats) { - return not stats.max.empty() || not stats.min.empty() || stats.null_count != -1 || - stats.distinct_count != -1 || not stats.max_value.empty() || - not stats.min_value.empty(); - }; - uint32_t num_columns = static_cast(md.row_groups[0].columns.size()); - md.column_order_listsize = - (num_columns > 0 && is_valid_stats(md.row_groups[0].columns[0].meta_data.statistics)) - ? num_columns - : 0; - } // Thrift-encode the resulting output file_header_s fhdr; file_ender_s fendr; diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu new file mode 100644 index 00000000000..d16237d7afe --- /dev/null +++ b/cpp/src/io/utilities/data_casting.cu @@ -0,0 +1,990 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cudf::io::json::detail { + +constexpr auto SINGLE_THREAD_THRESHOLD = 128; +constexpr auto WARP_THRESHOLD = 128 * 128; // 16K + +// Unicode code point escape sequence +static constexpr char UNICODE_SEQ = 0x7F; + +// Invalid escape sequence +static constexpr char NON_ESCAPE_CHAR = 0x7E; + +// Unicode code point escape sequence prefix comprises '\' and 'u' characters +static constexpr size_type UNICODE_ESC_PREFIX = 2; + +// Unicode code point escape sequence comprises four hex characters +static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4; + +// A unicode code point escape sequence is \uXXXX +static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT; + +static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800; +static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; + +/** + * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or + * whether type casting failed. + */ +enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE }; + +/** + * @brief Providing additional information about the type casting result. + */ +struct data_casting_result_info { + // Number of bytes written to output + size_type bytes; + // Whether parsing succeeded, item was parsed to null, or failed + data_casting_result result; +}; + +/** + * @brief Returns the character to output for a given escaped character that's following a + * backslash. + * + * @param escaped_char The character following the backslash. + * @return The character to output for a given character that's following a backslash + */ +__device__ __forceinline__ char get_escape_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return '"'; + case '\\': return '\\'; + case '/': return '/'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'u': return UNICODE_SEQ; + default: return NON_ESCAPE_CHAR; + } +} + +/** + * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence + * \uXXXX. + * + * @param str Pointer to the first (most-significant) hex digit + * @return The parsed hex value if successful, -1 otherwise. + */ +__device__ __forceinline__ int32_t parse_unicode_hex(char const* str) +{ + // Prepare result + int32_t result = 0, base = 1; + constexpr int32_t hex_radix = 16; + + // Iterate over hex digits right-to-left + size_type index = UNICODE_HEX_DIGIT_COUNT; + while (index-- > 0) { + char const ch = str[index]; + if (ch >= '0' && ch <= '9') { + result += static_cast((ch - '0') + 0) * base; + base *= hex_radix; + } else if (ch >= 'A' && ch <= 'F') { + result += static_cast((ch - 'A') + 10) * base; + base *= hex_radix; + } else if (ch >= 'a' && ch <= 'f') { + result += static_cast((ch - 'a') + 10) * base; + base *= hex_radix; + } else { + return -1; + } + } + return result; +} + +/** + * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to + * \p out_it + */ +constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) +{ + auto const bytes = (out_it == nullptr) ? 
strings::detail::bytes_in_char_utf8(character) + : strings::detail::from_char_utf8(character, out_it); + if (out_it) out_it += bytes; + return bytes; +} + +/** + * @brief Processes a string, replaces escape sequences and optionally strips off the quote + * characters. + * + * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to + * char + * @param in_begin Iterator to the first item to process + * @param in_end Iterator to one past the last item to process + * @param d_buffer Output character buffer to the first item to write + * @param options Settings for controlling string processing behavior + * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is + * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether + * parsing succeeded, item was parsed to null, or failed. + */ +template +__device__ __forceinline__ data_casting_result_info +process_string(in_iterator_t in_begin, + in_iterator_t in_end, + char* d_buffer, + cudf::io::parse_options_view const& options) +{ + int32_t bytes = 0; + auto const num_in_chars = thrust::distance(in_begin, in_end); + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); + + // Copy literal/numeric value + if (not is_string_value) { + bytes += (in_end - in_begin); + if (d_buffer) d_buffer = thrust::copy(thrust::seq, in_begin, in_end, d_buffer); + return {bytes, data_casting_result::PARSING_SUCCESS}; + } + char constexpr backslash_char = '\\'; + + // Escape-flag, set after encountering a backslash character + bool is_prev_char_escape = false; + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + + // Iterate over the input + while (in_begin != in_end) { + // Copy single character to output + if (!is_prev_char_escape) { + is_prev_char_escape = (*in_begin == backslash_char); + if (!is_prev_char_escape) { + if (d_buffer) *d_buffer++ = *in_begin; + ++bytes; + } + ++in_begin; + continue; + } + + // Previous char indicated beginning of escape sequence + // Reset escape flag for next loop iteration + is_prev_char_escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(*in_begin); + + // We escaped an invalid escape character -> "fail"/null for this item + if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; } + + // Regular, single-character escape + if (escaped_char != UNICODE_SEQ) { + if (d_buffer) *d_buffer++ = escaped_char; + ++bytes; + ++in_begin; + continue; + } + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + // Skip over the 'u' char from \uXXXX to the first hex digit + ++in_begin; + + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) { + return {bytes, data_casting_result::PARSING_FAILURE}; + } + + auto hex_val = parse_unicode_hex(in_begin); + + // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item + if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; } + + // Skip over the four hex digits + thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); + + 
// If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + int32_t hex_low_val = 0; + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && + *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { + // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low + // surrogate + hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2)); + } + + // This is indeed a UTF16 surrogate pair + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Skip over the second \uXXXX sequence + thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS); + + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + bytes += write_utf8_char(utf8_chars, d_buffer); + } else { + // Just a single \uXXXX sequence + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + bytes += write_utf8_char(utf8_chars, d_buffer); + } + } + + // The last character of the input is a backslash -> "fail"/null for this item + if (is_prev_char_escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } + return {bytes, data_casting_result::PARSING_SUCCESS}; +} + +/** + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * warp. + * + * @tparam num_warps number of warps in the block + */ +template +struct bitfield_warp { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. + // 5+32 for each warp. + bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size]; + + /// Sets all bits to 0 + __device__ void reset(unsigned warp_id) + { + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) { + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = 0; + } + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0; + } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = + is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size]; + __syncwarp(); + } + + /// Each thread in a warp sets its own bit. + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] = + is_escaping_backslash; + __syncwarp(); + } + + /// Each thread in a warp gets the requested bit. + __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[warp_id][UNICODE_LOOK_BACK + bit_index]; + } +}; + +/** + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * block. + * + * @tparam num_warps number of warps in the block + */ +template +struct bitfield_block { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. 
+ // 5 + num_warps*32 for entire block + bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size]; + + /// Sets all bits to 0 + __device__ void reset(unsigned warp_id) + { + if (threadIdx.x < UNICODE_LOOK_BACK) { is_slash[threadIdx.x] = 0; } + is_slash[threadIdx.x + UNICODE_LOOK_BACK] = 0; + } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x < UNICODE_LOOK_BACK) + is_slash[threadIdx.x] = is_slash[num_warps * cudf::detail::warp_size + threadIdx.x]; + __syncthreads(); + } + + /// Each thread in a block sets its own bit. + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash; + __syncthreads(); + } + + /// Each thread in a block gets the requested bit. + __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[UNICODE_LOOK_BACK + bit_index]; + } +}; + +// Algorithm: warp/block parallel version of string_parse and process_string() +// Decoding character classes (u8, u16, \*, *): +// character count: input->output +// \uXXXX 6->2/3/4 +// \uXXXX\uXXXX 12->2/3/4 +// \" 2->1 +// * 1->1 +// +// ERROR conditions. (all collaborating threads quit) +// c=='\' & curr_idx == end_idx-1; +// [c-1]=='\' & get_escape[c]==NEC +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex +// +// skip conditions. (current thread skips this char, no output) +// c=='\' skip. (Escaping char only) +// [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip. +// +// write conditions. (write to d_buffer) +// [c-1]!='\' & [c]!='\' write [c] +// [c-1]!='\' & [c]=='\' skip (already covered in skip conditions) +// [c-1]=='\' & [c]!=NEC && [c]!=UNICODE_SEQ, write [c] +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE +// [c+1:4]=curr_hex_val +// // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c+7:4]=next_hex_val +// // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val +// // if prev_hex_val in high, curr_hex_val in low, skip. +// // if curr_hex_val in high, next_hex_val in low, write [u16] +// if curr_hex_val not in high, write [u8] +// before writing, find num of output characters per threads, +// then do intra-warp/intra-block scan for out_idx +// propagate offset from next iteration to carry forward. 
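
For reference, the decoding rules summarized in the comment above are the same ones process_string() applies sequentially per string. The host-side C++ sketch below (not part of the patch) expresses them in one place; the helper names unescape_json, parse_hex4 and encode_utf8, and the handling of an unpaired high surrogate, are illustrative assumptions rather than the device implementation.

// Host-side reference sketch of the escape-decoding rules (illustrative, not the device code).
#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>

namespace sketch {

// Append a code point as UTF-8 (1-4 bytes).
void encode_utf8(uint32_t cp, std::string& out)
{
  if (cp < 0x80) {
    out += static_cast<char>(cp);
  } else if (cp < 0x800) {
    out += static_cast<char>(0xC0 | (cp >> 6));
    out += static_cast<char>(0x80 | (cp & 0x3F));
  } else if (cp < 0x10000) {
    out += static_cast<char>(0xE0 | (cp >> 12));
    out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
    out += static_cast<char>(0x80 | (cp & 0x3F));
  } else {
    out += static_cast<char>(0xF0 | (cp >> 18));
    out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
    out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
    out += static_cast<char>(0x80 | (cp & 0x3F));
  }
}

// Parse four hex digits at in[pos..pos+3]; returns -1 if out of bounds or not hex.
int32_t parse_hex4(std::string const& in, std::size_t pos)
{
  if (pos + 4 > in.size()) return -1;
  int32_t v = 0;
  for (std::size_t i = 0; i < 4; ++i) {
    char const c = in[pos + i];
    int const d  = (c >= '0' && c <= '9')   ? c - '0'
                   : (c >= 'a' && c <= 'f') ? c - 'a' + 10
                   : (c >= 'A' && c <= 'F') ? c - 'A' + 10
                                            : -1;
    if (d < 0) return -1;
    v = (v << 4) | d;
  }
  return v;
}

// Decodes escapes in an already-unquoted field; nullopt means "parsing failure" (null upstream).
std::optional<std::string> unescape_json(std::string const& in)
{
  std::string out;
  for (std::size_t i = 0; i < in.size(); ++i) {
    if (in[i] != '\\') {  // plain character: copied through unchanged
      out += in[i];
      continue;
    }
    if (++i == in.size()) return std::nullopt;  // trailing backslash
    switch (in[i]) {
      case '"': out += '"'; break;
      case '\\': out += '\\'; break;
      case '/': out += '/'; break;
      case 'b': out += '\b'; break;
      case 'f': out += '\f'; break;
      case 'n': out += '\n'; break;
      case 'r': out += '\r'; break;
      case 't': out += '\t'; break;
      case 'u': {
        int32_t const hi = parse_hex4(in, i + 1);
        if (hi < 0) return std::nullopt;  // fewer than 4 chars left, or non-hex digit
        i += 4;                           // i now sits on the last hex digit of \uXXXX
        // \uXXXX high surrogate immediately followed by a \uXXXX low surrogate
        if (hi >= 0xD800 && hi < 0xDC00 && i + 6 < in.size() && in[i + 1] == '\\' &&
            in[i + 2] == 'u') {
          int32_t const lo = parse_hex4(in, i + 3);
          if (lo >= 0xDC00 && lo < 0xE000) {
            encode_utf8(0x10000u + ((hi - 0xD800) << 10) + (lo - 0xDC00), out);
            i += 6;
            break;
          }
        }
        encode_utf8(static_cast<uint32_t>(hi), out);  // single (or unpaired) code unit
        break;
      }
      default: return std::nullopt;  // invalid escape character
    }
  }
  return out;
}

}  // namespace sketch
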
+// Uses 1 warp per string or 1 block per string + +/** + * @brief Warp/Block parallel version of string_parse functor + * + * @tparam is_warp True if 1 warp per string, False if 1 block per string + * @tparam num_warps Number of warps per block + * @tparam str_tuple_it Iterator type for tuple with string pointer and its length + * @param str_tuples iterator of tuple with string pointer and its length + * @param total_out_strings Number of string rows to be processed + * @param str_counter Counter to keep track of processed number of strings + * @param null_mask Null mask + * @param null_count_data pointer to store null count + * @param options Settings for controlling string processing behavior + * @param d_offsets Offsets to identify where to store the results for each string + * @param d_chars Character array to store the characters of strings + */ +template +__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) +{ + constexpr auto BLOCK_SIZE = + is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps; + size_type lane = is_warp ? (threadIdx.x % BLOCK_SIZE) : threadIdx.x; + + // get 1-string index per warp/block + auto get_next_string = [&]() { + if constexpr (is_warp) { + size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + return __shfl_sync(0xffffffff, istring, 0); + } else { + // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's + // istring value + __syncthreads(); + __shared__ size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + __syncthreads(); + return istring; + } + }; + // grid-stride loop. + for (size_type istring = get_next_string(); istring < total_out_strings; + istring = get_next_string()) { + // skip nulls + if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { + if (!d_chars && lane == 0) d_offsets[istring] = 0; + continue; // gride-stride return; + } + + auto in_begin = str_tuples[istring].first; + auto in_end = in_begin + str_tuples[istring].second; + auto const num_in_chars = str_tuples[istring].second; + if constexpr (is_warp) { + if (num_in_chars <= SINGLE_THREAD_THRESHOLD or num_in_chars > WARP_THRESHOLD) continue; + } else { + if (num_in_chars <= WARP_THRESHOLD) continue; + } + + // Check if the value corresponds to the null literal + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + if (lane == 0) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[istring] = 0; + } + continue; // gride-stride return; + } + } + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); + char* d_buffer = d_chars ? 
d_chars + d_offsets[istring] : nullptr; + + // Copy literal/numeric value + if (not is_string_value) { + if (!d_chars) { + if (lane == 0) { d_offsets[istring] = in_end - in_begin; } + } else { + for (thread_index_type char_index = lane; char_index < (in_end - in_begin); + char_index += BLOCK_SIZE) { + d_buffer[char_index] = in_begin[char_index]; + } + } + continue; // gride-stride return; + } + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + // warp-parallelized or block-parallelized process_string() + + auto is_hex = [](auto ch) { + return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); + }; + + // for backslash scan calculation: is_previous_escaping_backslash + [[maybe_unused]] auto warp_id = threadIdx.x / cudf::detail::warp_size; + bool init_state_reg; + __shared__ bool init_state_shared; + size_type last_offset_reg; + __shared__ size_type last_offset_shared; + bool& init_state(is_warp ? init_state_reg : init_state_shared); + size_type& last_offset(is_warp ? last_offset_reg : last_offset_shared); + if (is_warp || lane == 0) { + init_state = false; + last_offset = 0; + } + using bitfield = + std::conditional_t, bitfield_block>; + __shared__ bitfield is_slash; + is_slash.reset(warp_id); + __syncthreads(); + // 0-31, 32-63, ... i*32-n. + // entire warp executes but with mask. + for (thread_index_type char_index = lane; + char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); + char_index += BLOCK_SIZE) { + bool const is_within_bounds = char_index < (in_end - in_begin); + auto const MASK = is_warp ? __ballot_sync(0xffffffff, is_within_bounds) : 0xffffffff; + auto const c = is_within_bounds ? in_begin[char_index] : '\0'; + auto const prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; + auto const escaped_char = get_escape_char(c); + + bool is_escaping_backslash{false}; + [[maybe_unused]] bool is_prev_escaping_backslash{false}; + // To check current is backslash by checking if previous is backslash. + // curr = !prev & c=='\\' + // So, scan is required from beginning of string. + // State table approach (intra-warp FST) (intra-block FST) + // 2 states: Not-Slash(NS), Slash(S). + // prev / * + // NS S NS + // S NS NS + // After inclusive scan, all current S states translate to escaping backslash. + // All escaping backslash should be skipped. + + struct state_table { + // using bit fields instead of state[2] + bool state0 : 1; + bool state1 : 1; + bool inline __device__ get(bool init_state) const { return init_state ? state1 : state0; } + }; + state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. + auto composite_op = [](state_table op1, state_table op2) { + // equivalent of state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; + return state_table{op1.state0 ? op2.state1 : op2.state0, + op1.state1 ? 
op2.state1 : op2.state0}; + }; + state_table scanned; + // inclusive scan of escaping backslashes + if constexpr (is_warp) { + using SlashScan = cub::WarpScan; + __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; + SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.get(init_state); + init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); + __syncwarp(); + is_slash.shift(warp_id); + is_slash.set_bits(warp_id, is_escaping_backslash); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); + } else { + using SlashScan = cub::BlockScan; + __shared__ typename SlashScan::TempStorage temp_slash; + SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.get(init_state); + __syncthreads(); + if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; + __syncthreads(); + is_slash.shift(warp_id); + is_slash.set_bits(warp_id, is_escaping_backslash); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); + // There is another __syncthreads() at the end of for-loop. + } + + // String with parsing errors are made as null + bool error = false; + if (is_within_bounds) { + // curr=='\' and end, or prev=='\' and curr=='u' and end-curr < UNICODE_HEX_DIGIT_COUNT + // or prev=='\' and curr=='u' and end-curr >= UNICODE_HEX_DIGIT_COUNT and any non-hex + error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1); + error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR); + error |= (is_prev_escaping_backslash && c == 'u' && + ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | + !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | + !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); + } + // Make sure all threads have no errors before continuing + if constexpr (is_warp) { + error = __any_sync(MASK, error); + } else { + using ErrorReduce = cub::BlockReduce; + __shared__ typename ErrorReduce::TempStorage temp_storage_error; + __shared__ bool error_reduced; + error_reduced = ErrorReduce(temp_storage_error).Sum(error); // TODO use cub::LogicalOR. + // only valid in thread0, so shared memory is used for broadcast. + __syncthreads(); + error = error_reduced; + } + // If any thread has an error, skip the rest of the string and make this string as null + if (error) { + if (!d_chars && lane == 0) { + if (null_mask != nullptr) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + } + last_offset = 0; + d_offsets[istring] = 0; + } + if constexpr (!is_warp) { __syncthreads(); } + break; // gride-stride return; + } + + // Skipping non-copied escaped characters + bool skip = !is_within_bounds; // false; + // skip \ for \" \\ \/ \b \f \n \r \t \uXXXX + skip |= is_escaping_backslash; + if (is_within_bounds) { + // skip X for each X in \uXXXX + skip |= + char_index >= 2 && is_slash.get_bit(warp_id, lane - 2) && in_begin[char_index - 1] == 'u'; + skip |= + char_index >= 3 && is_slash.get_bit(warp_id, lane - 3) && in_begin[char_index - 2] == 'u'; + skip |= + char_index >= 4 && is_slash.get_bit(warp_id, lane - 4) && in_begin[char_index - 3] == 'u'; + skip |= + char_index >= 5 && is_slash.get_bit(warp_id, lane - 5) && in_begin[char_index - 4] == 'u'; + } + int this_num_out = 0; + cudf::char_utf8 write_char{}; + + if (!skip) { + // 1. 
Unescaped character + if (!is_prev_escaping_backslash) { + this_num_out = 1; + // writes char directly for non-unicode + } else { + // 2. Escaped character + if (escaped_char != UNICODE_SEQ) { + this_num_out = 1; + // writes char directly for non-unicode + } else { + // 3. Unicode + // UTF8 \uXXXX + auto hex_val = parse_unicode_hex(in_begin + char_index + 1); + auto hex_low_val = 0; + // UTF16 \uXXXX\uXXXX + // Note: no need for scanned_backslash below because we already know that + // only '\u' check is enough. + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + (in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) < + in_end && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS - 1] == '\\' && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS] == 'u') { + hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); + } + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + write_char = strings::detail::codepoint_to_utf8(unicode_code_point); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } else { + // if hex_val is high surrogate, ideally it should be parsing failure. + // but skipping it as other parsers do this too. + if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { + // Ideally this should be skipped if previous char is high surrogate. + skip = true; + this_num_out = 0; + write_char = 0; + } else { + // if UTF8 + write_char = strings::detail::codepoint_to_utf8(hex_val); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } + } + } + } + } // !skip end. + { + // compute offset to write output for each thread + size_type offset; + if constexpr (is_warp) { + using OffsetScan = cub::WarpScan; + __shared__ typename OffsetScan::TempStorage temp_storage[num_warps]; + OffsetScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); + } else { + using OffsetScan = cub::BlockScan; + __shared__ typename OffsetScan::TempStorage temp_storage; + OffsetScan(temp_storage).ExclusiveSum(this_num_out, offset); + __syncthreads(); + } + offset += last_offset; + // Write output + if (d_chars && !skip) { + auto const is_not_unicode = (!is_prev_escaping_backslash) || escaped_char != UNICODE_SEQ; + if (is_not_unicode) { + *(d_buffer + offset) = (!is_prev_escaping_backslash) ? 
c : escaped_char; + } else { + strings::detail::from_char_utf8(write_char, d_buffer + offset); + } + } + offset += this_num_out; + if constexpr (is_warp) { + last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); + } else { + __syncthreads(); + if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset; + __syncthreads(); + } + } + } // char for-loop + if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } + } // grid-stride for-loop +} + +template +struct string_parse { + str_tuple_it str_tuples; + bitmask_type* null_mask; + size_type* null_count_data; + cudf::io::parse_options_view const options; + size_type* d_offsets{}; + char* d_chars{}; + + __device__ void operator()(size_type idx) + { + if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const in_begin = str_tuples[idx].first; + auto const in_end = in_begin + str_tuples[idx].second; + auto const num_in_chars = str_tuples[idx].second; + + if (num_in_chars > SINGLE_THREAD_THRESHOLD) return; + + // Check if the value corresponds to the null literal + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[idx] = 0; + return; + } + } + + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto str_process_info = process_string(in_begin, in_end, d_buffer, options); + if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { + if (null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + } + if (!d_chars) d_offsets[idx] = 0; + } else { + if (!d_chars) d_offsets[idx] = str_process_info.bytes; + } + } +}; + +template +struct to_string_view_pair { + SymbolT const* data; + to_string_view_pair(SymbolT const* _data) : data(_data) {} + __device__ auto operator()(thrust::tuple ip) + { + return thrust::pair{data + thrust::get<0>(ip), + static_cast(thrust::get<1>(ip))}; + } +}; + +template +static std::unique_ptr parse_string(string_view_pair_it str_tuples, + size_type col_size, + rmm::device_buffer&& null_mask, + rmm::device_scalar& d_null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // CUDF_FUNC_RANGE(); + + auto const max_length = thrust::transform_reduce( + rmm::exec_policy(stream), + str_tuples, + str_tuples + col_size, + [] __device__(auto t) { return t.second; }, + size_type{0}, + thrust::maximum{}); + + auto offsets = cudf::make_numeric_column( + data_type{type_to_id()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + auto null_count_data = d_null_count.data(); + + auto single_thread_fn = string_parse{ + str_tuples, static_cast(null_mask.data()), null_count_data, options, d_offsets}; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + constexpr auto warps_per_block = 8; + constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; + auto num_blocks = cudf::util::div_rounding_up_safe(col_size, warps_per_block); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + + // TODO run these independent kernels in parallel streams. 
+ if (max_length > SINGLE_THREAD_THRESHOLD) { + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + + if (max_length > WARP_THRESHOLD) { + // for strings longer than WARP_THRESHOLD, 1 block per string + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", + std::overflow_error); + + // CHARS column + std::unique_ptr chars = + strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); + auto d_chars = chars->mutable_view().data(); + + single_thread_fn.d_chars = d_chars; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + if (max_length > SINGLE_THREAD_THRESHOLD) { + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + if (max_length > WARP_THRESHOLD) { + str_counter.set_value(0, stream); + // for strings longer than WARP_THRESHOLD, 1 block per string + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + return make_strings_column(col_size, + std::move(offsets), + std::move(chars), + d_null_count.value(stream), + std::move(null_mask)); +} + +std::unique_ptr parse_data( + const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + if (col_size == 0) { return make_empty_column(col_type); } + auto d_null_count = rmm::device_scalar(null_count, stream); + auto null_count_data = d_null_count.data(); + if (null_mask.is_empty()) { + null_mask = cudf::detail::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); + } + + // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion + auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); + + if (col_type == cudf::data_type{cudf::type_id::STRING}) { + return parse_string(str_tuples, + col_size, + std::forward(null_mask), + d_null_count, + options, + stream, + mr); + } + + auto out_col = + make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); + auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); + + // use `ConvertFunctor` to convert non-string values + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( + size_type row) { + if (col.is_null(row)) { return; } + auto const in = str_tuples[row]; + + auto const is_null_literal = + serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); + + if (is_null_literal) { + col.set_null(row); + atomicAdd(null_count_data, 1); + 
return; + } + + // If this is a string value, remove quotes + auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); + + auto const is_parsed = cudf::type_dispatcher(col_type, + ConvertFunctor{}, + in_begin, + in_end, + col.data(), + row, + col_type, + options, + false); + if (not is_parsed) { + col.set_null(row); + atomicAdd(null_count_data, 1); + } + }); + + out_col->set_null_count(d_null_count.value(stream)); + + return out_col; +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 5c3af588411..43d62fcd513 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,6 +116,28 @@ struct parse_options { } }; +/** + * @brief Returns the escaped characters for a given character. + * + * @param escaped_char The character to escape. + * @return The escaped characters for a given character. + */ +__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return {'\\', '"'}; + case '\\': return {'\\', '\\'}; + case '/': return {'\\', '/'}; + case '\b': return {'\\', 'b'}; + case '\f': return {'\\', 'f'}; + case '\n': return {'\\', 'n'}; + case '\r': return {'\\', 'r'}; + case '\t': return {'\\', 't'}; + // case 'u': return UNICODE_SEQ; + default: return {'\0', escaped_char}; + } +} + /** * @brief Returns the numeric value of an ASCII/UTF-8 character. * Handles hexadecimal digits, both uppercase and lowercase diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp new file mode 100644 index 00000000000..12fc0a5b2e7 --- /dev/null +++ b/cpp/src/io/utilities/string_parsing.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace cudf::io { +namespace detail { + +/** + * @brief Infers data type for a given JSON string input `data`. 
+ * + * @throw cudf::logic_error if input size is 0 + * @throw cudf::logic_error if date time is not inferred as string + * @throw cudf::logic_error if data type inference failed + * + * @param options View of inference options + * @param data JSON string input + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param size Size of the string input + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The inferred data type + */ +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream); +} // namespace detail + +namespace json::detail { + +/** + * @brief Parses the data from an iterator of string views, casting it to the given target data type + * + * @param data string input base pointer + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param col_size The total number of items of this column + * @param col_type The column's target data type + * @param null_mask A null mask that renders certain items from the input invalid + * @param options Settings for controlling the processing behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr The resource to be used for device memory allocation + * @return The column that contains the parsed data + */ +std::unique_ptr parse_data( + const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); +} // namespace json::detail +} // namespace cudf::io diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cu similarity index 84% rename from cpp/src/io/utilities/type_inference.cuh rename to cpp/src/io/utilities/type_inference.cu index a9ccc80ca33..79a5c8f1c4c 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,23 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#pragma once #include -#include +#include #include #include -#include #include -#include -#include #include -#include -#include - #include #include @@ -114,14 +107,14 @@ __device__ __inline__ bool is_like_float(std::size_t len, * * @param[in] options View of inference options * @param[in] data JSON string input - * @param[in] column_strings_begin The beginning of an offset-length tuple sequence + * @param[in] offset_length_begin The beginning of an offset-length tuple sequence * @param[in] size Size of the string input * @param[out] column_info Histogram of column type counters */ template __global__ void infer_column_type_kernel(OptionsView options, device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t size, cudf::io::column_type_histogram* column_info) { @@ -129,8 +122,8 @@ __global__ void infer_column_type_kernel(OptionsView options, for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; idx += gridDim.x * blockDim.x) { - auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); - auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); + auto const field_offset = thrust::get<0>(*(offset_length_begin + idx)); + auto const field_len = thrust::get<1>(*(offset_length_begin + idx)); auto const field_begin = data.begin() + field_offset; if (cudf::detail::serialized_trie_contains( @@ -234,7 +227,7 @@ __global__ void infer_column_type_kernel(OptionsView options, * * @param options View of inference options * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence + * @param offset_length_begin The beginning of an offset-length tuple sequence * @param size Size of the string input * @param stream CUDA stream used for device memory operations and kernel launches * @return A histogram containing column-specific type counters @@ -242,7 +235,7 @@ __global__ void infer_column_type_kernel(OptionsView options, template cudf::io::column_type_histogram infer_column_type(OptionsView const& options, cudf::device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t const size, rmm::cuda_stream_view stream) { @@ -254,40 +247,22 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); infer_column_type_kernel<<>>( - options, data, column_strings_begin, size, d_column_info.data()); + options, data, offset_length_begin, size, d_column_info.data()); return d_column_info.value(stream); } -/** - * @brief Infers data type for a given JSON string input `data`. 
- * - * @throw cudf::logic_error if input size is 0 - * @throw cudf::logic_error if date time is not inferred as string - * @throw cudf::logic_error if data type inference failed - * - * @tparam OptionsView Type of inference options view - * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to - * `thrust::tuple` - * - * @param options View of inference options - * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence - * @param size Size of the string input - * @param stream CUDA stream used for device memory operations and kernel launches - * @return The inferred data type - */ -template -cudf::data_type infer_data_type(OptionsView const& options, - device_span data, - ColumnStringIter column_strings_begin, - std::size_t const size, - rmm::cuda_stream_view stream) +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(size != 0, "No data available for data type inference.\n"); - auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream); + auto const h_column_info = infer_column_type(options, data, offset_length_begin, size, stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 4c1b1ed98b1..e96505e5ed6 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -17,13 +17,12 @@ #include #include +#include +#include #include #include #include -#include -#include - #include #include diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index f8e7b4c6126..40a14d805e1 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,12 +36,12 @@ namespace cudf { namespace lists { namespace detail { /** - * @brief Returns a numeric column containing lengths of each element. + * @brief Returns a numeric column containing lengths of each element * - * @param input Input lists column. - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New INT32 column with lengths. + * @return New size_type column with lengths */ std::unique_ptr count_elements(lists_column_view const& input, rmm::cuda_stream_view stream, @@ -52,7 +52,7 @@ std::unique_ptr count_elements(lists_column_view const& input, // create output column auto output = make_fixed_width_column(data_type{type_to_id()}, input.size(), - copy_bitmask(input.parent()), + cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr); diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index 260636a61cf..49054ebb046 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -70,13 +70,13 @@ std::unique_ptr sort_lists(lists_column_view const& input, auto output_offset = build_output_offsets(input, stream, mr); auto const child = input.get_sliced_child(stream); - auto const sorted_child_table = segmented_sort_by_key(table_view{{child}}, - table_view{{child}}, - output_offset->view(), - {column_order}, - {null_precedence}, - stream, - mr); + auto const sorted_child_table = cudf::detail::segmented_sort_by_key(table_view{{child}}, + table_view{{child}}, + output_offset->view(), + {column_order}, + {null_precedence}, + stream, + mr); return make_lists_column(input.size(), std::move(output_offset), @@ -98,13 +98,13 @@ std::unique_ptr stable_sort_lists(lists_column_view const& input, auto output_offset = build_output_offsets(input, stream, mr); auto const child = input.get_sliced_child(stream); - auto const sorted_child_table = stable_segmented_sort_by_key(table_view{{child}}, - table_view{{child}}, - output_offset->view(), - {column_order}, - {null_precedence}, - stream, - mr); + auto const sorted_child_table = cudf::detail::stable_segmented_sort_by_key(table_view{{child}}, + table_view{{child}}, + output_offset->view(), + {column_order}, + {null_precedence}, + stream, + mr); return make_lists_column(input.size(), std::move(output_offset), diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 2b48aed2d29..950cb484ddf 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -163,7 +163,9 @@ std::enable_if_t(), std::unique_ptr> clamp auto output = detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); // mask will not change - if (input.nullable()) { output->set_null_mask(copy_bitmask(input), input.null_count()); } + if (input.nullable()) { + output->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + } auto output_device_view = cudf::mutable_column_device_view::create(output->mutable_view(), stream); diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 3b6d53f43c4..0648ef3d30f 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -70,7 +70,22 @@ namespace cudf { namespace detail { -namespace { // anonymous +/// Helper function to materialize preceding/following offsets. +template +std::unique_ptr expand_to_column(Calculator const& calc, + size_type const& num_rows, + rmm::cuda_stream_view stream) +{ + auto window_column = cudf::make_numeric_column( + cudf::data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream); + + auto begin = cudf::detail::make_counting_transform_iterator(0, calc); + + thrust::copy_n( + rmm::exec_policy(stream), begin, num_rows, window_column->mutable_view().data()); + + return window_column; +} /** * @brief Operator for applying a generic (non-specialized) rolling aggregation on a single window. 
@@ -91,14 +106,14 @@ struct DeviceRolling { // operations we do support template - DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) + explicit DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) : min_periods(_min_periods) { } // operations we don't support template - DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) + explicit DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) : min_periods(_min_periods) { CUDF_FAIL("Invalid aggregation/type pair"); @@ -111,7 +126,7 @@ struct DeviceRolling { mutable_column_device_view& output, size_type start_index, size_type end_index, - size_type current_index) + size_type current_index) const { using AggOp = typename corresponding_operator::type; AggOp agg_op; @@ -144,7 +159,7 @@ struct DeviceRolling { template struct DeviceRollingArgMinMaxBase { size_type min_periods; - DeviceRollingArgMinMaxBase(size_type _min_periods) : min_periods(_min_periods) {} + explicit DeviceRollingArgMinMaxBase(size_type _min_periods) : min_periods(_min_periods) {} static constexpr bool is_supported() { @@ -162,7 +177,7 @@ struct DeviceRollingArgMinMaxBase { */ template struct DeviceRollingArgMinMaxString : DeviceRollingArgMinMaxBase { - DeviceRollingArgMinMaxString(size_type _min_periods) + explicit DeviceRollingArgMinMaxString(size_type _min_periods) : DeviceRollingArgMinMaxBase(_min_periods) { } @@ -461,8 +476,8 @@ struct agg_specific_empty_output { } }; -std::unique_ptr empty_output_for_rolling_aggregation(column_view const& input, - rolling_aggregation const& agg) +static std::unique_ptr empty_output_for_rolling_aggregation(column_view const& input, + rolling_aggregation const& agg) { // TODO: // Ideally, for UDF aggregations, the returned column would match @@ -1215,8 +1230,6 @@ struct dispatch_rolling { } }; -} // namespace - // Applies a user-defined rolling window function to the values in a column. template std::unique_ptr rolling_window_udf(column_view const& input, diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu index fb7b1b5f590..e951db955e5 100644 --- a/cpp/src/rolling/detail/rolling_fixed_window.cu +++ b/cpp/src/rolling/detail/rolling_fixed_window.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,9 @@ #include #include +#include + #include -#include namespace cudf::detail { @@ -43,6 +44,9 @@ std::unique_ptr rolling_window(column_view const& input, CUDF_EXPECTS((default_outputs.is_empty() || default_outputs.size() == input.size()), "Defaults column must be either empty or have as many rows as the input column."); + CUDF_EXPECTS(-(preceding_window - 1) <= following_window, + "Preceding window bounds must precede the following window bounds."); + if (agg.kind == aggregation::CUDA || agg.kind == aggregation::PTX) { // TODO: In future, might need to clamp preceding/following to column boundaries. return cudf::detail::rolling_window_udf(input, @@ -58,18 +62,22 @@ std::unique_ptr rolling_window(column_view const& input, // Clamp preceding/following to column boundaries. // E.g. 
If preceding_window == 2, then for a column of 5 elements, preceding_window will be: // [1, 2, 2, 2, 1] - auto const preceding_window_begin = cudf::detail::make_counting_transform_iterator( - 0, - [preceding_window] __device__(size_type i) { return thrust::min(i + 1, preceding_window); }); - auto const following_window_begin = cudf::detail::make_counting_transform_iterator( - 0, [col_size = input.size(), following_window] __device__(size_type i) { - return thrust::min(col_size - i - 1, following_window); - }); + auto const preceding_calc = [preceding_window] __device__(size_type i) { + return thrust::min(i + 1, preceding_window); + }; + + auto const following_calc = [col_size = input.size(), + following_window] __device__(size_type i) { + return thrust::min(col_size - i - 1, following_window); + }; + + auto const preceding_column = expand_to_column(preceding_calc, input.size(), stream); + auto const following_column = expand_to_column(following_calc, input.size(), stream); return cudf::detail::rolling_window(input, default_outputs, - preceding_window_begin, - following_window_begin, + preceding_column->view().begin(), + following_column->view().begin(), min_periods, agg, stream, diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index ca5c04d1c4f..7ac784bef43 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -30,7 +30,6 @@ #include #include -#include #include #include #include @@ -94,6 +93,109 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, namespace detail { +/// Preceding window calculation functor. +template +struct row_based_preceding_calc { + cudf::size_type const* _group_offsets_begin; + cudf::size_type const* _group_labels_begin; + cudf::size_type const _preceding_window; + + row_based_preceding_calc(rmm::device_uvector const& group_offsets, + rmm::device_uvector const& group_labels, + cudf::size_type const& preceding_window) + : _group_offsets_begin(group_offsets.data()), + _group_labels_begin(group_labels.data()), + _preceding_window(preceding_window) + { + } + + __device__ cudf::size_type operator()(cudf::size_type const& idx) const + { + auto group_label = _group_labels_begin[idx]; + if constexpr (preceding_less_than_1) { // where 1 indicates only the current row. + auto group_end = _group_offsets_begin[group_label + 1]; + return thrust::maximum{}(_preceding_window, -(group_end - 1 - idx)); + } else { + auto group_start = _group_offsets_begin[group_label]; + return thrust::minimum{}(_preceding_window, + idx - group_start + 1); // Preceding includes current row. + } + } +}; + +/// Helper to materialize preceding-window column, corrected to respect group boundaries. +/// E.g. If preceding window == 5, then, +/// 1. For the first row in the group, the preceding is set to 1, +/// 2. For the next row in the group, preceding is set to 2, etc. +std::unique_ptr make_preceding_column( + rmm::device_uvector const& group_offsets, + rmm::device_uvector const& group_labels, + cudf::size_type const& preceding_window, + cudf::size_type const& num_rows, + rmm::cuda_stream_view stream) +{ + if (preceding_window < 1) { + auto const calc = row_based_preceding_calc(group_offsets, group_labels, preceding_window); + return cudf::detail::expand_to_column(calc, num_rows, stream); + } else { + auto const calc = + row_based_preceding_calc(group_offsets, group_labels, preceding_window); + return cudf::detail::expand_to_column(calc, num_rows, stream); + } +} + +/// Following window calculation functor. 
+template +struct row_based_following_calc { + cudf::size_type const* _group_offsets_begin; + cudf::size_type const* _group_labels_begin; + cudf::size_type const _following_window; + + row_based_following_calc(rmm::device_uvector const& group_offsets, + rmm::device_uvector const& group_labels, + cudf::size_type const& following_window) + : _group_offsets_begin(group_offsets.data()), + _group_labels_begin(group_labels.data()), + _following_window(following_window) + { + } + + __device__ cudf::size_type operator()(cudf::size_type const& idx) const + { + auto group_label = _group_labels_begin[idx]; + if constexpr (following_less_than_0) { + auto group_start = _group_offsets_begin[group_label]; + return thrust::maximum{}(_following_window, -(idx - group_start) - 1); + } else { + auto group_end = + _group_offsets_begin[group_label + 1]; // Cannot fall off the end, since offsets + // is capped with `input.size()`. + return thrust::minimum{}(_following_window, (group_end - 1) - idx); + } + } +}; + +/// Helper to materialize following-window column, corrected to respect group boundaries. +/// i.e. If following window == 5, then: +/// 1. For the last row in the group, the following is set to 0. +/// 2. For the second last row in the group, following is set to 1, etc. +std::unique_ptr make_following_column( + rmm::device_uvector const& group_offsets, + rmm::device_uvector const& group_labels, + cudf::size_type const& following_window, + cudf::size_type const& num_rows, + rmm::cuda_stream_view stream) +{ + if (following_window < 0) { + auto const calc = row_based_following_calc(group_offsets, group_labels, following_window); + return cudf::detail::expand_to_column(calc, num_rows, stream); + } else { + auto const calc = + row_based_following_calc(group_offsets, group_labels, following_window); + return cudf::detail::expand_to_column(calc, num_rows, stream); + } +} + std::unique_ptr grouped_rolling_window(table_view const& group_keys, column_view const& input, column_view const& default_outputs, @@ -111,7 +213,7 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, CUDF_EXPECTS((group_keys.num_columns() == 0 || group_keys.num_rows() == input.size()), "Size mismatch between group_keys and input vector."); - CUDF_EXPECTS((min_periods > 0), "min_periods must be positive"); + CUDF_EXPECTS((min_periods >= 0), "min_periods must be non-negative"); CUDF_EXPECTS((default_outputs.is_empty() || default_outputs.size() == input.size()), "Defaults column must be either empty or have as many rows as the input column."); @@ -127,6 +229,9 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, auto const preceding_window = preceding_window_bounds.value(); auto const following_window = following_window_bounds.value(); + CUDF_EXPECTS(-(preceding_window - 1) <= following_window, + "Preceding window bounds must precede the following window bounds."); + if (group_keys.num_columns() == 0) { // No Groupby columns specified. Treat as one big group. 
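
A host-side C++ sketch (not part of the patch) of the clamping rule that row_based_preceding_calc and row_based_following_calc apply before expand_to_column() materializes the window columns; the function names preceding_at/following_at are illustrative only.

// Illustrative sketch: clamp a fixed row-based window to group boundaries.
// group_offsets has one entry per group plus a final entry equal to the row count;
// group_labels maps each row index to its group index.
#include <algorithm>
#include <cstdint>
#include <vector>

using size_type = int32_t;

size_type preceding_at(std::vector<size_type> const& group_offsets,
                       std::vector<size_type> const& group_labels,
                       size_type preceding_window,
                       size_type idx)
{
  auto const label = group_labels[idx];
  if (preceding_window < 1) {  // bound excludes the current row: cannot reach past the group's end
    auto const group_end = group_offsets[label + 1];
    return std::max(preceding_window, -(group_end - 1 - idx));
  }
  auto const group_start = group_offsets[label];
  // Preceding includes the current row, hence the +1.
  return std::min(preceding_window, idx - group_start + 1);
}

size_type following_at(std::vector<size_type> const& group_offsets,
                       std::vector<size_type> const& group_labels,
                       size_type following_window,
                       size_type idx)
{
  auto const label = group_labels[idx];
  if (following_window < 0) {  // negative bound: cannot reach before the group's start
    auto const group_start = group_offsets[label];
    return std::max(following_window, -(idx - group_start) - 1);
  }
  auto const group_end = group_offsets[label + 1];  // capped at the row count
  return std::min(following_window, (group_end - 1) - idx);
}

// Example: group_offsets {0, 3, 5}, group_labels {0, 0, 0, 1, 1}, preceding_window == 5
// gives preceding_at == [1, 2, 3, 1, 2], matching the doc comment on make_preceding_column().
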
return rolling_window( @@ -157,24 +262,6 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, group_offsets.element(group_offsets.size() - 1, stream) == input.size() && "Must have at least one group."); - auto preceding_calculator = [d_group_offsets = group_offsets.data(), - d_group_labels = group_labels.data(), - preceding_window] __device__(size_type idx) { - auto group_label = d_group_labels[idx]; - auto group_start = d_group_offsets[group_label]; - return thrust::minimum{}(preceding_window, - idx - group_start + 1); // Preceding includes current row. - }; - - auto following_calculator = [d_group_offsets = group_offsets.data(), - d_group_labels = group_labels.data(), - following_window] __device__(size_type idx) { - auto group_label = d_group_labels[idx]; - auto group_end = d_group_offsets[group_label + 1]; // Cannot fall off the end, since offsets - // is capped with `input.size()`. - return thrust::minimum{}(following_window, (group_end - 1) - idx); - }; - if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) { cudf::detail::preceding_window_wrapper grouped_preceding_window{ group_offsets.data(), group_labels.data(), preceding_window}; @@ -192,15 +279,18 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, stream, mr); } else { - return cudf::detail::rolling_window( - input, - default_outputs, - cudf::detail::make_counting_transform_iterator(0, preceding_calculator), - cudf::detail::make_counting_transform_iterator(0, following_calculator), - min_periods, - aggr, - stream, - mr); + auto const preceding_column = + make_preceding_column(group_offsets, group_labels, preceding_window, input.size(), stream); + auto const following_column = + make_following_column(group_offsets, group_labels, following_window, input.size(), stream); + return cudf::detail::rolling_window(input, + default_outputs, + preceding_column->view().begin(), + following_column->view().begin(), + min_periods, + aggr, + stream, + mr); } } @@ -267,6 +357,16 @@ template struct device_value_accessor { column_device_view const col; ///< column view of column in device + /// Checks that the type used to access device values matches the rep-type + /// of the order-by column. + struct is_correct_range_rep { + template /// Order-by type. + constexpr bool operator()() const + { + return std::is_same_v>; + } + }; + /** * @brief constructor * @@ -274,8 +374,11 @@ struct device_value_accessor { */ explicit __device__ device_value_accessor(column_device_view const& col_) : col{col_} { - cudf_assert(type_id_matches_device_storage_type(col.type().id()) && - "the data type mismatch"); + // For non-timestamp types, T must match the order-by column's type. + // For timestamp types, T must match the range rep type for the order-by column. 
+ cudf_assert((type_id_matches_device_storage_type(col.type().id()) or + cudf::type_dispatcher(col.type(), is_correct_range_rep{})) && + "data type mismatch when accessing the order-by column"); } /** @@ -321,22 +424,6 @@ std::tuple get_null_bounds_for_orderby_column( : std::make_tuple(num_rows - num_nulls, num_rows); } -template -std::unique_ptr expand_to_column(Calculator const& calc, - size_type const& num_rows, - rmm::cuda_stream_view stream) -{ - auto window_column = cudf::make_numeric_column( - cudf::data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream); - - auto begin = cudf::detail::make_counting_transform_iterator(0, calc); - - thrust::copy_n( - rmm::exec_policy(stream), begin, num_rows, window_column->mutable_view().data()); - - return window_column; -} - /// Range window computation, with /// 1. no grouping keys specified /// 2. rows in ASCENDING order. @@ -390,7 +477,8 @@ std::unique_ptr range_window_ASC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. }; - auto const preceding_column = expand_to_column(preceding_calculator, input.size(), stream); + auto const preceding_column = + cudf::detail::expand_to_column(preceding_calculator, input.size(), stream); auto const following_calculator = [nulls_begin_idx = h_nulls_begin_idx, @@ -425,7 +513,8 @@ std::unique_ptr range_window_ASC(column_view const& input, 1; }; - auto const following_column = expand_to_column(following_calculator, input.size(), stream); + auto const following_column = + cudf::detail::expand_to_column(following_calculator, input.size(), stream); return cudf::detail::rolling_window( input, preceding_column->view(), following_column->view(), min_periods, aggr, stream, mr); @@ -570,7 +659,8 @@ std::unique_ptr range_window_ASC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. }; - auto const preceding_column = expand_to_column(preceding_calculator, input.size(), stream); + auto const preceding_column = + cudf::detail::expand_to_column(preceding_calculator, input.size(), stream); auto const following_calculator = [d_group_offsets = group_offsets.data(), @@ -616,7 +706,8 @@ std::unique_ptr range_window_ASC(column_view const& input, 1; }; - auto const following_column = expand_to_column(following_calculator, input.size(), stream); + auto const following_column = + cudf::detail::expand_to_column(following_calculator, input.size(), stream); return cudf::detail::rolling_window( input, preceding_column->view(), following_column->view(), min_periods, aggr, stream, mr); @@ -675,7 +766,8 @@ std::unique_ptr range_window_DESC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. }; - auto const preceding_column = expand_to_column(preceding_calculator, input.size(), stream); + auto const preceding_column = + cudf::detail::expand_to_column(preceding_calculator, input.size(), stream); auto const following_calculator = [nulls_begin_idx = h_nulls_begin_idx, @@ -710,7 +802,8 @@ std::unique_ptr range_window_DESC(column_view const& input, 1; }; - auto const following_column = expand_to_column(following_calculator, input.size(), stream); + auto const following_column = + cudf::detail::expand_to_column(following_calculator, input.size(), stream); return cudf::detail::rolling_window( input, preceding_column->view(), following_column->view(), min_periods, aggr, stream, mr); @@ -774,7 +867,8 @@ std::unique_ptr range_window_DESC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. 
}; - auto const preceding_column = expand_to_column(preceding_calculator, input.size(), stream); + auto const preceding_column = + cudf::detail::expand_to_column(preceding_calculator, input.size(), stream); auto const following_calculator = [d_group_offsets = group_offsets.data(), @@ -817,7 +911,8 @@ std::unique_ptr range_window_DESC(column_view const& input, 1; }; - auto const following_column = expand_to_column(following_calculator, input.size(), stream); + auto const following_column = + cudf::detail::expand_to_column(following_calculator, input.size(), stream); if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) { CUDF_FAIL("Ranged rolling window does NOT (yet) support UDF."); diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index d699d7bea85..5c78cc4382d 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +20,6 @@ #include #include -#include - namespace cudf { // Applies a fixed-size rolling window function to the values in a column, with default output diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index 25c594e9e74..39476a2f534 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -73,7 +73,8 @@ bool is_sorted(cudf::table_view const& in, bool is_sorted(cudf::table_view const& in, std::vector const& column_order, - std::vector const& null_precedence) + std::vector const& null_precedence, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); if (in.num_columns() == 0 || in.num_rows() == 0) { return true; } @@ -89,7 +90,7 @@ bool is_sorted(cudf::table_view const& in, "Number of columns in the table doesn't match the vector null_precedence's size .\n"); } - return detail::is_sorted(in, column_order, null_precedence, cudf::get_default_stream()); + return detail::is_sorted(in, column_order, null_precedence, stream); } } // namespace cudf diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index fd65e38d467..3ead8cfcbaa 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -366,16 +366,11 @@ std::unique_ptr rank(column_view const& input, null_policy null_handling, null_order null_precedence, bool percentage, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rank(input, - method, - column_order, - null_handling, - null_precedence, - percentage, - cudf::get_default_stream(), - mr); + return detail::rank( + input, method, column_order, null_handling, null_precedence, percentage, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu index 38d008c120c..d9457341bd2 100644 --- a/cpp/src/sort/segmented_sort.cu +++ b/cpp/src/sort/segmented_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -81,11 +81,12 @@ std::unique_ptr segmented_sorted_order(table_view const& keys, column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::segmented_sorted_order( - keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + keys, segment_offsets, column_order, null_precedence, stream, mr); } std::unique_ptr
segmented_sort_by_key(table_view const& values, @@ -93,11 +94,12 @@ std::unique_ptr<table>
segmented_sort_by_key(table_view const& values, column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::segmented_sort_by_key( - values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + values, keys, segment_offsets, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh index 37664f33762..5d11bf055f1 100644 --- a/cpp/src/sort/segmented_sort_impl.cuh +++ b/cpp/src/sort/segmented_sort_impl.cuh @@ -166,7 +166,7 @@ std::unique_ptr fast_segmented_sorted_order(column_view const& input, // Unfortunately, CUB's segmented sort functions cannot accept iterators. // We have to build a pre-filled sequence of indices as input. auto sorted_indices = - cudf::detail::sequence(input.size(), numeric_scalar{0}, stream, mr); + cudf::detail::sequence(input.size(), numeric_scalar{0, true, stream}, stream, mr); auto indices_view = sorted_indices->mutable_view(); cudf::type_dispatcher(input.type(), diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index 25b95af4f83..46edae798d4 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -109,30 +109,32 @@ std::unique_ptr
sort(table_view const& input, std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sorted_order(input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sorted_order(input, column_order, null_precedence, stream, mr); } std::unique_ptr
sort(table_view const& input, std::vector<order> const& column_order, std::vector<null_order> const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort(input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort(input, column_order, null_precedence, stream, mr); } std::unique_ptr<table>
sort_by_key(table_view const& values, table_view const& keys, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_by_key( - values, keys, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort_by_key(values, keys, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/stable_segmented_sort.cu b/cpp/src/sort/stable_segmented_sort.cu index 40df1b50279..4725d65e05d 100644 --- a/cpp/src/sort/stable_segmented_sort.cu +++ b/cpp/src/sort/stable_segmented_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,11 +55,12 @@ std::unique_ptr stable_segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::stable_segmented_sorted_order( - keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + keys, segment_offsets, column_order, null_precedence, stream, mr); } std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, @@ -67,11 +68,12 @@ std::unique_ptr<table>
stable_segmented_sort_by_key(table_view const& values, column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::stable_segmented_sort_by_key( - values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + values, keys, segment_offsets, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index 6f5678c4168..cf602dcf1a9 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -62,22 +62,22 @@ std::unique_ptr
stable_sort_by_key(table_view const& values, std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::stable_sorted_order( - input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::stable_sorted_order(input, column_order, null_precedence, stream, mr); } std::unique_ptr
stable_sort_by_key(table_view const& values, table_view const& keys, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::stable_sort_by_key( - values, keys, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::stable_sort_by_key(values, keys, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp index 58d958d2ff4..18c531e3e69 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.hpp +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -15,12 +15,11 @@ */ #pragma once +#include +#include #include #include -#include -#include - #include #include diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 3de9dd34d83..1299e552565 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -305,20 +305,22 @@ std::unique_ptr find(strings_column_view const& strings, string_scalar const& target, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find(strings, target, start, stop, cudf::get_default_stream(), mr); + return detail::find(strings, target, start, stop, stream, mr); } std::unique_ptr rfind(strings_column_view const& strings, string_scalar const& target, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rfind(strings, target, start, stop, cudf::get_default_stream(), mr); + return detail::rfind(strings, target, start, stop, stream, mr); } std::unique_ptr find(strings_column_view const& input, @@ -618,50 +620,56 @@ std::unique_ptr ends_with(strings_column_view const& strings, std::unique_ptr contains(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, target, cudf::get_default_stream(), mr); + return detail::contains(strings, target, stream, mr); } std::unique_ptr contains(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, targets, cudf::get_default_stream(), mr); + return detail::contains(strings, targets, stream, mr); } std::unique_ptr starts_with(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, target, cudf::get_default_stream(), mr); + return detail::starts_with(strings, target, stream, mr); } std::unique_ptr starts_with(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, targets, cudf::get_default_stream(), mr); + return detail::starts_with(strings, targets, stream, mr); } std::unique_ptr ends_with(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, target, cudf::get_default_stream(), mr); + return detail::ends_with(strings, target, stream, mr); } 
std::unique_ptr ends_with(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, targets, cudf::get_default_stream(), mr); + return detail::ends_with(strings, targets, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index 4a823ad1dcb..fcaec835f4d 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -70,8 +70,8 @@ std::unique_ptr find_multiple(strings_column_view const& input, results->set_null_count(0); auto offsets = cudf::detail::sequence(strings_count + 1, - numeric_scalar(0), - numeric_scalar(targets_count), + numeric_scalar(0, true, stream), + numeric_scalar(targets_count, true, stream), stream, mr); return make_lists_column(strings_count, @@ -88,10 +88,11 @@ std::unique_ptr find_multiple(strings_column_view const& input, // external API std::unique_ptr find_multiple(strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find_multiple(input, targets, cudf::get_default_stream(), mr); + return detail::find_multiple(input, targets, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 2df64c6a0a7..acea4ff1c51 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -134,10 +134,11 @@ std::unique_ptr findall(strings_column_view const& input, std::unique_ptr findall(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall(input, prog, cudf::get_default_stream(), mr); + return detail::findall(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 938fd45246d..5f2f4d021a4 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -150,10 +150,11 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s std::unique_ptr generate_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr); + return detail::generate_ngrams(strings, ngrams, separator, stream, mr); } namespace detail { @@ -317,18 +318,20 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::generate_character_ngrams(strings, ngrams, stream, mr); } std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::hash_character_ngrams(strings, ngrams, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/jaccard.cu 
b/cpp/src/text/jaccard.cu index 5b55745c2c7..95324847ea0 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -107,7 +107,7 @@ rmm::device_uvector compute_unique_counts(cudf::column_view con * * This is called with a warp per row */ -struct sorted_interset_fn { +struct sorted_intersect_fn { cudf::column_device_view const d_input1; cudf::column_device_view const d_input2; cudf::size_type* d_results; @@ -151,7 +151,7 @@ rmm::device_uvector compute_intersect_counts(cudf::column_view auto const d_input1 = cudf::column_device_view::create(input1, stream); auto const d_input2 = cudf::column_device_view::create(input2, stream); auto d_results = rmm::device_uvector(input1.size(), stream); - sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()}; + sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()}; thrust::for_each_n(rmm::exec_policy(stream), thrust::counting_iterator(0), input1.size() * cudf::detail::warp_size, diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index fd1cbf99221..73d85513e95 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -265,11 +265,11 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s cudf::size_type ngrams, cudf::string_scalar const& delimiter, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ngrams_tokenize( - strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr); + return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 83aa22aaae9..2fa879ea734 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -18,10 +18,9 @@ #include -#include - #include #include +#include #include #include diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4923ef5c903..68ff6c54c99 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -453,6 +453,7 @@ ConfigureTest( rolling/grouped_rolling_test.cpp rolling/lead_lag_test.cpp rolling/nth_element_test.cpp + rolling/offset_row_window_test.cpp rolling/range_comparator_test.cu rolling/range_rolling_window_test.cpp rolling/range_window_bounds_test.cpp @@ -620,14 +621,21 @@ ConfigureTest( STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu ) -ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_STRINGS_TEST streams/strings/case_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp 
STREAM_MODE testing) +ConfigureTest( + STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE + testing +) +ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 9a5cc3733af..a898106a5b2 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -456,3 +456,98 @@ INSTANTIATE_TEST_CASE_P(FromArrowTest, std::make_tuple(0, 0), std::make_tuple(0, 3000), std::make_tuple(10000, 10000))); + +template +struct FromArrowNumericScalarTest : public cudf::test::BaseFixture {}; + +using NumericTypesNotBool = + cudf::test::Concat; +TYPED_TEST_SUITE(FromArrowNumericScalarTest, NumericTypesNotBool); + +TYPED_TEST(FromArrowNumericScalarTest, Basic) +{ + TypeParam const value{42}; + auto const arrow_scalar = arrow::MakeScalar(value); + auto const cudf_scalar = cudf::from_arrow(*arrow_scalar); + auto const cudf_numeric_scalar = + dynamic_cast*>(cudf_scalar.get()); + if (cudf_numeric_scalar == nullptr) { CUDF_FAIL("Attempted to test with a non-numeric type."); } + EXPECT_EQ(cudf_numeric_scalar->type(), cudf::data_type(cudf::type_to_id())); + EXPECT_EQ(cudf_numeric_scalar->value(), value); +} + +struct FromArrowDecimalScalarTest : public cudf::test::BaseFixture {}; + +// Only testing Decimal128 because that's the only size cudf and arrow have in common. +TEST_F(FromArrowDecimalScalarTest, Basic) +{ + auto const value{42}; + auto const precision{8}; + auto const scale{4}; + auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale)); + auto cudf_scalar = cudf::from_arrow(arrow_scalar); + + // Arrow offers a minimum of 128 bits for the Decimal type. 
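+  // Note: Arrow stores value = unscaled * 10^(-scale) while cudf's fixed_point stores
+  // value = significand * 10^(scale), so the two scale conventions have opposite signs;
+  // the Arrow type above is built with -scale and the cudf type below is checked against +scale.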
+ auto const cudf_decimal_scalar = + dynamic_cast*>(cudf_scalar.get()); + EXPECT_EQ(cudf_decimal_scalar->type(), + cudf::data_type(cudf::type_to_id(), scale)); + EXPECT_EQ(cudf_decimal_scalar->value(), value); +} + +struct FromArrowStringScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowStringScalarTest, Basic) +{ + auto const value = std::string("hello world"); + auto const arrow_scalar = arrow::StringScalar(value); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto const cudf_string_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_string_scalar->type(), cudf::data_type(cudf::type_id::STRING)); + EXPECT_EQ(cudf_string_scalar->to_string(), value); +} + +struct FromArrowListScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowListScalarTest, Basic) +{ + std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector host_validity = {true, true, true, false, true, true, true}; + + arrow::Int64Builder builder; + auto const status = builder.AppendValues(host_values, host_validity); + auto const maybe_array = builder.Finish(); + auto const array = *maybe_array; + + auto const arrow_scalar = arrow::ListScalar(array); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto const cudf_list_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_list_scalar->type(), cudf::data_type(cudf::type_id::LIST)); + + cudf::test::fixed_width_column_wrapper const lhs( + host_values.begin(), host_values.end(), host_validity.begin()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lhs, cudf_list_scalar->view()); +} + +struct FromArrowStructScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowStructScalarTest, Basic) +{ + int64_t const value{42}; + auto const underlying_arrow_scalar = arrow::MakeScalar(value); + + auto const field = arrow::field("", underlying_arrow_scalar->type); + auto const arrow_type = arrow::struct_({field}); + auto const arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto const cudf_struct_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_struct_scalar->type(), cudf::data_type(cudf::type_id::STRUCT)); + + cudf::test::fixed_width_column_wrapper const col({value}); + cudf::table_view const lhs({col}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view()); +} diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 97d80984272..6bb4cdfd747 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -578,4 +579,106 @@ INSTANTIATE_TEST_CASE_P(ToArrowTest, std::make_tuple(0, 0), std::make_tuple(0, 3000))); +template +struct ToArrowNumericScalarTest : public cudf::test::BaseFixture {}; + +using NumericTypesNotBool = + cudf::test::Concat; +TYPED_TEST_SUITE(ToArrowNumericScalarTest, NumericTypesNotBool); + +TYPED_TEST(ToArrowNumericScalarTest, Basic) +{ + TypeParam const value{42}; + auto const cudf_scalar = cudf::make_fixed_width_scalar(value); + + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const ref_arrow_scalar = arrow::MakeScalar(value); + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {}; + +// Only testing Decimal128 because that's the only size cudf and arrow have in common. 
+TEST_F(ToArrowDecimalScalarTest, Basic) +{ + auto const value{42}; + auto const precision{18}; // cudf will convert to the widest-precision Arrow scalar of the type + int32_t const scale{4}; + + auto const cudf_scalar = + cudf::make_fixed_point_scalar(value, numeric::scale_type{scale}); + + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const maybe_ref_arrow_scalar = + arrow::MakeScalar(arrow::decimal128(precision, -scale), value); + if (!maybe_ref_arrow_scalar.ok()) { CUDF_FAIL("Failed to construct reference scalar"); } + auto const ref_arrow_scalar = *maybe_ref_arrow_scalar; + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowStringScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowStringScalarTest, Basic) +{ + std::string const value{"hello world"}; + auto const cudf_scalar = cudf::make_string_scalar(value); + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const ref_arrow_scalar = arrow::MakeScalar(value); + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowListScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowListScalarTest, Basic) +{ + std::vector const host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector const host_validity = {true, true, true, false, true, true, true}; + + cudf::test::fixed_width_column_wrapper const col( + host_values.begin(), host_values.end(), host_validity.begin()); + + auto const cudf_scalar = cudf::make_list_scalar(col); + + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + arrow::Int64Builder builder; + auto const status = builder.AppendValues(host_values, host_validity); + auto const maybe_array = builder.Finish(); + auto const array = *maybe_array; + + auto const ref_arrow_scalar = arrow::ListScalar(array); + + EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); +} + +struct ToArrowStructScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowStructScalarTest, Basic) +{ + int64_t const value{42}; + auto const field_name{"a"}; + + cudf::test::fixed_width_column_wrapper const col{value}; + cudf::table_view const tbl({col}); + auto const cudf_scalar = cudf::make_struct_scalar(tbl); + + cudf::column_metadata metadata{""}; + metadata.children_meta.emplace_back(field_name); + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const underlying_arrow_scalar = arrow::MakeScalar(value); + auto const field = arrow::field(field_name, underlying_arrow_scalar->type, false); + auto const arrow_type = arrow::struct_({field}); + auto const ref_arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + + EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 220f1a3391f..7c911ac2e04 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1370,6 +1371,124 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view()); } +TEST_F(JsonReaderTest, JsonLongString) +{ + // Unicode + // 0000-FFFF Basic Multilingual Plane + // 10000-10FFFF Supplementary Plane + cudf::test::strings_column_wrapper col1{ + { + "\"\\/\b\f\n\r\t", + "\"", + "\\", + "/", + "\b", + "\f\n", + "\r\t", + "$€", + 
"ராபிட்ஸ்", + "C𝞵𝓓𝒻", + "", // null + "", // null + "கார்த்தி", + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉", // 30000-3FFFF + R"("$€ \u0024\u20ac \\u0024\\u20ac \\\u0024\\\u20ac \\\\u0024\\\\u20ac)", + R"( \\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\abcd)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + }, + cudf::test::iterators::nulls_at({10, 11})}; + + cudf::test::fixed_width_column_wrapper repeat_times{ + {1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3}, + cudf::test::iterators::no_nulls()}; + auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times); + auto col2 = d_col2->view(); + cudf::table_view const tbl_view{{col1, col2, repeat_times}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(true) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + + cudf::table_view const expected = tbl_view; + std::map types; + types["col1"] = data_type{type_id::STRING}; + types["col2"] = data_type{type_id::STRING}; + types["int16"] = data_type{type_id::INT16}; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}) + .lines(true) + .dtypes(types); + + // Read test data via nested JSON reader + auto const table = cudf::io::read_json(json_lines_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view()); +} + +TEST_F(JsonReaderTest, ErrorStrings) +{ + // cases of invalid escape characters, invalid unicode encodings. + // Error strings will decode to nulls + auto const buffer = std::string{R"( + {"col0": "\"\a"} + {"col0": "\u"} + {"col0": "\u0"} + {"col0": "\u0b"} + {"col0": "\u00b"} + {"col0": "\u00bz"} + {"col0": "\t34567890123456\t9012345678901\ug0bc"} + {"col0": "\t34567890123456\t90123456789012\u0hbc"} + {"col0": "\t34567890123456\t90123456789012\u00ic"} + {"col0": "\u0b95\u0bbe\u0bb0\u0bcd\u0ba4\u0bcd\u0ba4\u0bbfகார்த்தி"} +)"}; + // Last one is not an error case, but shows that unicode in json is copied string column output. 
+ + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) + .dtypes({data_type{cudf::type_id::STRING}}) + .lines(true) + .legacy(false); + + auto const result = cudf::io::read_json(in_opts); + auto const result_view = result.tbl->view().column(0); + + EXPECT_EQ(result.metadata.schema_info[0].name, "col0"); + EXPECT_EQ(result_view.null_count(), 9); + cudf::test::strings_column_wrapper expected{ + {"", + "", + "", + "", + "", + "", + "", + "", + "", + "கார்த்தி\xe0\xae\x95\xe0\xae\xbe\xe0\xae\xb0\xe0\xaf\x8d\xe0\xae\xa4\xe0\xaf\x8d\xe0\xae\xa4" + "\xe0\xae\xbf"}, + // unicode hex 0xe0 0xae 0x95 0xe0 0xae 0xbe 0xe0 0xae 0xb0 0xe0 0xaf 0x8d + // 0xe0 0xae 0xa4 0xe0 0xaf 0x8d 0xe0 0xae 0xa4 0xe0 0xae 0xbf + cudf::test::iterators::nulls_at({0, 1, 2, 3, 4, 5, 6, 7, 8})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected); +} + TEST_F(JsonReaderTest, TokenAllocation) { std::array const json_inputs{ diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 5c32131114d..9eb5e8f5230 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -21,15 +21,20 @@ #include #include +#include + #include #include #include -#include #include #include #include #include +#include + +#include +#include #include using namespace cudf::test::iterators; @@ -37,13 +42,27 @@ using namespace cudf::test::iterators; struct JSONTypeCastTest : public cudf::test::BaseFixture {}; namespace { -struct to_thrust_pair_fn { - __device__ thrust::pair operator()( - thrust::pair const& p) +struct offsets_to_length { + __device__ cudf::size_type operator()(thrust::tuple const& p) { - return {p.first.data(), p.first.size_bytes()}; + return thrust::get<1>(p) - thrust::get<0>(p); } }; + +/// Returns length of each string in the column +auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream) +{ + auto offsets_begin = column.offsets_begin(); + auto offsets_pair = + thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); + rmm::device_uvector svs_length(column.size(), stream); + thrust::transform(rmm::exec_policy(cudf::get_default_stream()), + offsets_pair, + offsets_pair + column.size(), + svs_length.begin(), + offsets_to_length{}); + return svs_length; +} } // namespace auto default_json_options() @@ -67,26 +86,23 @@ TEST_F(JSONTypeCastTest, String) std::vector input_values{"this", "is", "null", "of", "", "strings", R"("null")"}; cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids); - auto d_column = cudf::column_device_view::create(input); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(input); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto str_col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + 
thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); @@ -103,26 +119,23 @@ TEST_F(JSONTypeCastTest, Int) auto const type = cudf::data_type{cudf::type_id::INT64}; cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"}); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; @@ -146,26 +159,23 @@ TEST_F(JSONTypeCastTest, StringEscapes) R"("escape with nothing to escape \")", R"("\"\\\/\b\f\n\r\t")", }); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::strings_column_wrapper{ {"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"}, @@ -173,4 +183,71 @@ TEST_F(JSONTypeCastTest, StringEscapes) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } +TEST_F(JSONTypeCastTest, ErrorNulls) +{ + auto const stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + auto const type = cudf::data_type{cudf::type_id::STRING}; + + // error in decoding + std::vector input_values{R"("\"\a")", + R"("\u")", + R"("\u0")", + R"("\u0b")", + R"("\u00b")", + R"("\u00bz")", + R"("\t34567890123456\t9012345678901\ug0bc")", + R"("\t34567890123456\t90123456789012\u0hbc")", + 
R"("\t34567890123456\t90123456789012\u00ic")", + R"("\t34567890123456\t9012345678901\")", + R"("\t34567890123456\t90123456789012\")", + R"(null)"}; + // Note: without quotes are copied without decoding + cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end()); + + auto column = cudf::strings_column_view(input); + auto space_length = 128; + auto prepend_space = [&space_length](auto const& s) { + if (s[0] == '"') return "\"" + std::string(space_length, ' ') + std::string(s + 1); + return std::string(s); + }; + std::vector small_input; + std::transform( + input_values.begin(), input_values.end(), std::back_inserter(small_input), prepend_space); + cudf::test::strings_column_wrapper small_col(small_input.begin(), small_input.end()); + + std::vector large_input; + space_length = 128 * 128; + std::transform( + input_values.begin(), input_values.end(), std::back_inserter(large_input), prepend_space); + cudf::test::strings_column_wrapper large_col(large_input.begin(), large_input.end()); + + std::vector expected_values{"", "", "", "", "", "", "", "", "", "", "", ""}; + cudf::test::strings_column_wrapper expected( + expected_values.begin(), expected_values.end(), cudf::test::iterators::all_nulls()); + + // single threads, warp, block. + for (auto const& column : + {column, cudf::strings_column_view(small_col), cudf::strings_column_view(large_col)}) { + rmm::device_uvector svs_length = string_offset_to_length(column, stream); + + auto null_mask_it = no_nulls(); + auto null_mask = + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(str_col->view(), expected); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index b2eb1b94f9c..a14e7ecf5b3 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ +#include #include -#include #include #include @@ -50,8 +50,8 @@ TEST_F(TypeInference, Basic) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 4, 7}; - auto const string_length = std::vector{2, 2, 1}; + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -83,8 +83,8 @@ TEST_F(TypeInference, Null) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 4}; - auto const string_length = std::vector{0, 2, 1}; + auto const string_offset = std::vector{1, 1, 4}; + auto const string_length = std::vector{0, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -116,8 +116,8 @@ TEST_F(TypeInference, AllNull) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 1}; - auto const string_length = std::vector{0, 0, 4}; + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -149,8 +149,8 @@ TEST_F(TypeInference, String) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 8, 12}; - auto const string_length = std::vector{6, 3, 4}; + auto const string_offset = std::vector{1, 8, 12}; + auto const string_length = std::vector{6, 3, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -182,8 +182,8 @@ TEST_F(TypeInference, Bool) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 6, 12}; - auto const string_length = std::vector{4, 5, 5}; + auto const string_offset = std::vector{1, 6, 12}; + auto const string_length = std::vector{4, 5, 5}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -215,8 +215,8 @@ TEST_F(TypeInference, Timestamp) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 10}; - auto const string_length = std::vector{8, 9}; + auto const string_offset = std::vector{1, 10}; + auto const string_length = std::vector{8, 9}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -249,8 +249,8 @@ TEST_F(TypeInference, InvalidInput) auto d_data = 
cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 3, 5, 7, 9}; - auto const string_length = std::vector{1, 1, 1, 1, 1}; + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index 774f2f7fc40..7dd72ace53c 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -33,9 +33,6 @@ #include #include -#include -#include - const std::string cuda_func{ R"***( template @@ -637,7 +634,7 @@ TYPED_TEST(GroupedRollingTest, ZeroWindow) key_1_vec.end()); const cudf::table_view grouping_keys{std::vector{key_0, key_1}}; - cudf::size_type preceding_window = 0; + cudf::size_type preceding_window = 1; cudf::size_type following_window = 0; std::vector expected_group_offsets{0, 4, 8, DATA_SIZE}; diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp new file mode 100644 index 00000000000..ec726878b34 --- /dev/null +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +template +using fwcw = cudf::test::fixed_width_column_wrapper; +template +using decimals_column = cudf::test::fixed_point_column_wrapper; +using ints_column = fwcw; +using bigints_column = fwcw; +using strings_column = cudf::test::strings_column_wrapper; +using lists_column = cudf::test::lists_column_wrapper; +using column_ptr = std::unique_ptr; +using cudf::test::iterators::all_nulls; +using cudf::test::iterators::no_nulls; +using cudf::test::iterators::nulls_at; + +auto constexpr null = int32_t{0}; // NULL representation for int32_t; + +struct OffsetRowWindowTest : public cudf::test::BaseFixture { + static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; + static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + struct rolling_runner { + cudf::window_bounds _preceding, _following; + cudf::size_type _min_periods; + bool _grouped = true; + + rolling_runner(cudf::window_bounds const& preceding, + cudf::window_bounds const& following, + cudf::size_type min_periods_ = 1) + : _preceding{preceding}, _following{following}, _min_periods{min_periods_} + { + } + + rolling_runner& min_periods(cudf::size_type min_periods_) + { + _min_periods = min_periods_; + return *this; + } + + rolling_runner& grouped(bool grouped_) + { + _grouped = grouped_; + return *this; + } + + std::unique_ptr operator()(cudf::rolling_aggregation const& agg) const + { + auto const grouping_keys = + _grouped ? std::vector{_keys} : std::vector{}; + return cudf::grouped_rolling_window( + cudf::table_view{grouping_keys}, _values, _preceding, _following, _min_periods, agg); + } + }; +}; + +ints_column const OffsetRowWindowTest::_keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; +ints_column const OffsetRowWindowTest::_values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + +auto const AGG_COUNT_NON_NULL = + cudf::make_count_aggregation(cudf::null_policy::EXCLUDE); +auto const AGG_COUNT_ALL = + cudf::make_count_aggregation(cudf::null_policy::INCLUDE); +auto const AGG_MIN = cudf::make_min_aggregation(); +auto const AGG_MAX = cudf::make_max_aggregation(); +auto const AGG_SUM = cudf::make_sum_aggregation(); +auto const AGG_COLLECT_LIST = cudf::make_collect_list_aggregation(); + +TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_3_to_Minus_1) +{ + auto const preceding = cudf::window_bounds::get(3); + auto const following = cudf::window_bounds::get(-1); + auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_MIN), ints_column{{null, 0, 0, 1, 2, 3, null, 6, 6, 7}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_MAX), ints_column{{null, 0, 1, 2, 3, 4, null, 6, 7, 8}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_SUM), + bigints_column{{null, 0, 1, 3, 5, 7, null, 6, 13, 15}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{}, {0}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {}, {6}, {6, 7}, {7, 8}}, + nulls_at({0, 6})}); + + run_rolling.min_periods(0); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, no_nulls()}); 
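+  // With min_periods == 0, even an empty window satisfies the minimum, so the rows that were
+  // null with min_periods == 1 above now yield a count of 0 instead.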
+ + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{}, {0}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {}, {6}, {6, 7}, {7, 8}}, no_nulls()}); +} + +TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_3_to_Minus_1) +{ + auto const preceding = cudf::window_bounds::get(3); + auto const following = cudf::window_bounds::get(-1); + auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_MIN), + ints_column{{null, 0, 0, 1, 2, 3, 4, 5, 6, 7}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_MAX), + ints_column{{null, 0, 1, 2, 3, 4, 5, 6, 7, 8}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_SUM), bigints_column{{null, 0, 1, 3, 5, 7, 9, 11, 13, 15}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{}, {0}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}}, + nulls_at({0})}); + + run_rolling.min_periods(0); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{}, {0}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}}, + no_nulls()}); +} + +TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) +{ + auto const preceding = cudf::window_bounds::get(0); + auto const following = cudf::window_bounds::get(2); + auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{2, 2, 2, 2, 1, null, 2, 2, 1, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COUNT_ALL), + ints_column{{2, 2, 2, 2, 1, null, 2, 2, 1, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_MIN), ints_column{{1, 2, 3, 4, 5, null, 7, 8, 9, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_MAX), ints_column{{2, 3, 4, 5, 5, null, 8, 9, 9, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_SUM), + bigints_column{{3, 5, 7, 9, 5, null, 15, 17, 9, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, + nulls_at({5, 9})}); + + run_rolling.min_periods(0); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{2, 2, 2, 2, 1, 0, 2, 2, 1, 0}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{2, 2, 2, 2, 1, 0, 2, 2, 1, 0}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, no_nulls}); +} + +TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) +{ + auto const preceding = 
cudf::window_bounds::get(0); + auto const following = cudf::window_bounds::get(2); + auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_MIN), + ints_column{{1, 2, 3, 4, 5, 6, 7, 8, 9, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_MAX), + ints_column{{2, 3, 4, 5, 6, 7, 8, 9, 9, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_SUM), bigints_column{{3, 5, 7, 9, 11, 13, 15, 17, 9, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, + nulls_at({9})}); + + run_rolling.min_periods(0); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, 0}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, 0}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, + no_nulls}); +} + +// To test that preceding bounds are clamped correctly at group boundaries. +TEST_F(OffsetRowWindowTest, TestNegativeBoundsClamp) +{ + auto const grp_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), [](auto const& i) { + return i / 10; // 0-9 in the first group, 10-19 in the second, etc. + }); + auto const agg_iter = thrust::make_constant_iterator(1); + + auto const grp = ints_column(grp_iter, grp_iter + 30); + auto const agg = ints_column(agg_iter, agg_iter + 30); + + auto const min_periods = 0; + auto const rolling_sum = [&](auto const preceding, auto const following) { + return cudf::grouped_rolling_window( + cudf::table_view{{grp}}, agg, preceding, following, min_periods, *AGG_SUM); + }; + + // Testing negative preceding. + for (auto const preceding : {0, -1, -2, -5, -10, -20, -50}) { + auto const results = rolling_sum(preceding, 100); + auto const expected_fun = [&](auto const& i) { + assert(preceding < 1); + auto const index_in_group = i % 10; + auto const start = std::min(-(preceding - 1) + index_in_group, 10); + return int64_t{10 - start}; + }; + auto const expected_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), expected_fun); + auto const expected = bigints_column(expected_iter, expected_iter + 30, no_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + + // Testing negative following. 
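+  // With `preceding` spanning the whole group and a negative `following`, row i's window ends
+  // |following| rows before it, so the expected sum is max(index_in_group + following + 1, 0).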
+ for (auto const following : {-1, -2, -5, -10, -20, -50}) { + auto const results = rolling_sum(100, following); + auto const expected_fun = [&](auto const& i) { + assert(following < 0); + auto const index_in_group = i % 10; + auto const end = std::max(index_in_group + following, -1); + return int64_t{end + 1}; + }; + auto const expected_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), expected_fun); + auto const expected = bigints_column(expected_iter, expected_iter + 30, no_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(OffsetRowWindowTest, CheckGroupBoundaries) +{ + auto grp_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), [](auto const& i) { + if (i < 10) return 1; + if (i < 20) return 2; + return 3; + }); + auto const grp = ints_column(grp_iter, grp_iter + 30); + auto const agg = ints_column(grp_iter, grp_iter + 30); + { + auto const results = + cudf::grouped_rolling_window(cudf::table_view{{grp}}, + agg, + -80, + 100, + 1, + *cudf::make_max_aggregation()); + auto const null_iter = thrust::make_constant_iterator(null); + auto const expected = ints_column(null_iter, null_iter + 30, all_nulls()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } + { + auto const results = + cudf::grouped_rolling_window(cudf::table_view{{grp}}, + agg, + -1, + 4, + 1, + *cudf::make_min_aggregation()); + auto const expected = + ints_column{{1, 1, 1, 1, 1, 1, 1, 1, null, null, 2, 2, 2, 2, 2, + 2, 2, 2, null, null, 3, 3, 3, 3, 3, 3, 3, 3, null, null}, + nulls_at({8, 9, 18, 19, 28, 29})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } +} diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index e410e2488b3..d0181974479 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -148,20 +148,6 @@ TEST_F(RollingStringTest, MinPeriods) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count_all, got_count_all->view()); } -TEST_F(RollingStringTest, ZeroWindowSize) -{ - cudf::test::strings_column_wrapper input( - {"This", "is", "rolling", "test", "being", "operated", "on", "string", "column"}, - {1, 0, 0, 1, 0, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper expected_count( - {0, 0, 0, 0, 0, 0, 0, 0, 0}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - - auto got_count = cudf::rolling_window( - input, 0, 0, 0, *cudf::make_count_aggregation()); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count, got_count->view()); -} - // ========================================================================================= class RollingStructTest : public cudf::test::BaseFixture {}; @@ -970,6 +956,7 @@ TEST_F(RollingtVarStdTestUntyped, SimpleStaticVarianceStdInfNaN) #undef XXX } +/* // negative sizes TYPED_TEST(RollingTest, NegativeWindowSizes) { @@ -980,10 +967,12 @@ TYPED_TEST(RollingTest, NegativeWindowSizes) std::vector window{3}; std::vector negative_window{-2}; + this->run_test_col_agg(input, negative_window, window, 1); this->run_test_col_agg(input, window, negative_window, 1); this->run_test_col_agg(input, negative_window, negative_window, 1); } + */ // simple example from Pandas docs: TYPED_TEST(RollingTest, SimpleDynamic) @@ -1033,6 +1022,7 @@ TYPED_TEST(RollingTest, AllInvalid) } // window = following_window = 0 +// Note: Preceding includes current row, so its value is set to 1. 
TYPED_TEST(RollingTest, ZeroWindow) { cudf::size_type num_rows = 1000; @@ -1042,10 +1032,11 @@ TYPED_TEST(RollingTest, ZeroWindow) cudf::test::fixed_width_column_wrapper input( col_data.begin(), col_data.end(), col_mask.begin()); - std::vector window({0}); + std::vector preceding({0}); + std::vector following({1}); cudf::size_type periods = num_rows; - this->run_test_col_agg(input, window, window, periods); + this->run_test_col_agg(input, preceding, following, periods); } // min_periods = 0 diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp new file mode 100644 index 00000000000..f48e64c078e --- /dev/null +++ b/cpp/tests/streams/dictionary_test.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +class DictionaryTest : public cudf::test::BaseFixture {}; + +TEST_F(DictionaryTest, Encode) +{ + cudf::test::fixed_width_column_wrapper col({1, 2, 3, 4, 5}); + cudf::data_type int32_type(cudf::type_id::UINT32); + cudf::column_view col_view = col; + cudf::dictionary::encode(col_view, int32_type, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, Decode) +{ + // keys = {0, 2, 6}, indices = {0, 1, 1, 2, 2} + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::dictionary::decode(dict_col_view, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, GetIndex) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::numeric_scalar key_scalar(2, true, cudf::test::get_default_stream()); + cudf::dictionary::get_index(dict_col_view, key_scalar, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, AddKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::test::fixed_width_column_wrapper new_keys_col({8, 9}); + cudf::dictionary::add_keys(dict_col_view, new_keys_col, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, RemoveKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::test::fixed_width_column_wrapper keys_to_remove_col({2}); + cudf::dictionary::remove_keys( + dict_col_view, keys_to_remove_col, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, RemoveUnsedKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::dictionary::remove_unused_keys(dict_col_view, 
cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, SetKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::test::fixed_width_column_wrapper keys_col({2, 6}); + cudf::dictionary::set_keys(dict_col_view, keys_col, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, MatchDictionaries) +{ + std::vector elements_a{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col_a(elements_a.begin(), elements_a.end()); + cudf::dictionary_column_view dict_col_view_a = dict_col_a; + + std::vector elements_b{1, 3, 4, 5, 5}; + cudf::test::dictionary_column_wrapper dict_col_b(elements_b.begin(), elements_b.end()); + cudf::dictionary_column_view dict_col_view_b = dict_col_b; + + std::vector dicts = {dict_col_view_a, dict_col_view_b}; + + cudf::test::fixed_width_column_wrapper keys_col({2, 6}); + cudf::dictionary::match_dictionaries(dicts, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp new file mode 100644 index 00000000000..7eac9e016eb --- /dev/null +++ b/cpp/tests/streams/interop_test.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +struct ArrowTest : public cudf::test::BaseFixture {}; + +TEST_F(ArrowTest, ToArrow) +{ + int32_t const value{42}; + auto col = cudf::test::fixed_width_column_wrapper{{value}}; + cudf::table_view tbl{{col}}; + + std::vector metadata{{""}}; + cudf::to_arrow(tbl, metadata, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, FromArrow) +{ + std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector host_validity = {true, true, true, false, true, true, true}; + + arrow::Int64Builder builder; + auto status = builder.AppendValues(host_values, host_validity); + auto maybe_array = builder.Finish(); + auto array = *maybe_array; + + auto field = arrow::field("", arrow::int32()); + auto schema = arrow::schema({field}); + auto table = arrow::Table::Make(schema, {array}); + cudf::from_arrow(*table, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, ToArrowScalar) +{ + int32_t const value{42}; + auto cudf_scalar = + cudf::make_fixed_width_scalar(value, cudf::test::get_default_stream()); + + cudf::column_metadata metadata{""}; + cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, FromArrowScalar) +{ + int32_t const value{42}; + auto arrow_scalar = arrow::MakeScalar(value); + cudf::from_arrow(*arrow_scalar, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/sorting_test.cpp b/cpp/tests/streams/sorting_test.cpp new file mode 100644 index 00000000000..e481f95bded --- /dev/null +++ b/cpp/tests/streams/sorting_test.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +class SortingTest : public cudf::test::BaseFixture {}; + +TEST_F(SortingTest, SortedOrder) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::sorted_order(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSortedOrder) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::stable_sorted_order(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, IsSorted) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::is_sorted(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, Sort) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::sort(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, SortByKey) +{ + cudf::test::fixed_width_column_wrapper const values_col{10, 20, 30, 40, 50}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const keys_col{10, 20, 30, 40, 50}; + cudf::table_view const keys{{keys_col}}; + + cudf::sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSortByKey) +{ + cudf::test::fixed_width_column_wrapper const values_col{10, 20, 30, 40, 50}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const keys_col{10, 20, 30, 40, 50}; + cudf::table_view const keys{{keys_col}}; + + cudf::stable_sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, Rank) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::rank(column, + cudf::rank_method::AVERAGE, + cudf::order::ASCENDING, + cudf::null_policy::EXCLUDE, + cudf::null_order::AFTER, + false, + cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, SegmentedSortedOrder) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{3, 7}; + + cudf::segmented_sorted_order(keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSegmentedSortedOrder) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{3, 7}; + + cudf::stable_segmented_sorted_order( + keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, SegmentedSortByKey) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper 
const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{0, 3, 7, 10}; + + cudf::segmented_sort_by_key( + values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSegmentedSortByKey) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{0, 3, 7, 10}; + + cudf::stable_segmented_sort_by_key( + values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp new file mode 100644 index 00000000000..b734a1738cc --- /dev/null +++ b/cpp/tests/streams/strings/find_test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsFindTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsFindTest, Find) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + cudf::strings::find(view, target, 0, -1, cudf::test::get_default_stream()); + cudf::strings::rfind(view, target, 0, -1, cudf::test::get_default_stream()); + cudf::strings::find(view, view, 0, cudf::test::get_default_stream()); + cudf::strings::find_multiple(view, view, cudf::test::get_default_stream()); + cudf::strings::contains(view, target, cudf::test::get_default_stream()); + cudf::strings::starts_with(view, target, cudf::test::get_default_stream()); + cudf::strings::starts_with(view, view, cudf::test::get_default_stream()); + cudf::strings::ends_with(view, target, cudf::test::get_default_stream()); + cudf::strings::ends_with(view, view, cudf::test::get_default_stream()); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::findall(view, *prog, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp new file mode 100644 index 00000000000..bce0d2b680b --- /dev/null +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +class TextNGramsTest : public cudf::test::BaseFixture {}; + +TEST_F(TextNGramsTest, GenerateNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::generate_ngrams( + cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, GenerateCharacterNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + nvtext::generate_character_ngrams( + cudf::strings_column_view(input), 3, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, HashCharacterNgrams) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + nvtext::hash_character_ngrams( + cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, NgramsTokenize) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::ngrams_tokenize( + cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 323b3eed3e2..7b179588385 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams) { cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"}; cudf::strings_column_view strings_view(strings); + auto const separator = cudf::string_scalar("_"); { cudf::test::strings_column_wrapper expected{ "the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view); + auto const results = nvtext::generate_ngrams(strings_view, 2, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls) h_strings.begin(), h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto const separator = cudf::string_scalar("_"); cudf::strings_column_view strings_view(strings); { - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"}; 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); - auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column)); + auto const separator = cudf::string_scalar("_"); + + auto results = + nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 2, separator); cudf::test::expect_column_empty(results->view()); results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column)); cudf::test::expect_column_empty(results->view()); @@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty) TEST_F(TextGenerateNgramsTest, Errors) { cudf::test::strings_column_wrapper strings{""}; + auto const separator = cudf::string_scalar("_"); // invalid parameter value - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); // not enough strings to generate ngrams - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); - std::vector h_strings{"", nullptr, "", nullptr}; - cudf::test::strings_column_wrapper strings_no_tokens( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)), + cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0}); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator), cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)), cudf::logic_error); diff --git a/cpp/tests/text/ngrams_tokenize_tests.cpp b/cpp/tests/text/ngrams_tokenize_tests.cpp index 5879bec3e64..c6fb886f7e5 100644 --- a/cpp/tests/text/ngrams_tokenize_tests.cpp +++ b/cpp/tests/text/ngrams_tokenize_tests.cpp @@ -62,7 +62,7 @@ TEST_F(TextNgramsTokenizeTest, Tokenize) "mousé_ate", "ate_the", "the_cheese"}; - auto results = nvtext::ngrams_tokenize(strings_view); + auto results = nvtext::ngrams_tokenize(strings_view, 2, std::string(), std::string("_")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -101,9 +101,10 @@ TEST_F(TextNgramsTokenizeTest, TokenizeOneGram) { cudf::test::strings_column_wrapper strings{"aaa bbb", " ccc ddd ", "eee"}; cudf::strings_column_view strings_view(strings); + auto const empty = cudf::string_scalar(""); cudf::test::strings_column_wrapper expected{"aaa", "bbb", "ccc", "ddd", "eee"}; - auto results = nvtext::ngrams_tokenize(strings_view, 1); + auto results = nvtext::ngrams_tokenize(strings_view, 1, empty, empty); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -111,7 +112,8 @@ TEST_F(TextNgramsTokenizeTest, TokenizeEmptyTest) { auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); cudf::strings_column_view strings_view(strings->view()); - auto results = nvtext::ngrams_tokenize(strings_view); + auto const empty = 
cudf::string_scalar(""); + auto results = nvtext::ngrams_tokenize(strings_view, 2, empty, empty); EXPECT_EQ(results->size(), 0); EXPECT_EQ(results->has_nulls(), false); } @@ -120,5 +122,6 @@ TEST_F(TextNgramsTokenizeTest, TokenizeErrorTest) { cudf::test::strings_column_wrapper strings{"this column intentionally left blank"}; cudf::strings_column_view strings_view(strings); - EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0), cudf::logic_error); + auto const empty = cudf::string_scalar(""); + EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0, empty, empty), cudf::logic_error); } diff --git a/dependencies.yaml b/dependencies.yaml index 398ae193fe6..5586f54348c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -218,6 +218,7 @@ dependencies: - libkvikio==23.10.* - output_types: conda packages: + - aws-sdk-cpp<1.11 - fmt>=9.1.0,<10 - &gbench benchmark==1.8.0 - >est gtest>=1.13.0 @@ -259,7 +260,8 @@ dependencies: # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==12.0.1.* - - numpy>=1.21 + # TODO: Pin to numpy<1.25 until cudf requires pandas 2 + - &numpy numpy>=1.21,<1.25 build_python: common: - output_types: [conda, requirements, pyproject] @@ -425,14 +427,15 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - numpy>=1.21 + - *numpy - pandas>=1.3,<1.6.0dev0 run_cudf: common: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba numba>=0.57 + # TODO: Pin to numba<0.58 until #14160 is resolved + - &numba numba>=0.57,<0.58 - nvtx>=0.2.1 - packaging - rmm==23.10.* diff --git a/python/cudf/cudf/_lib/cpp/interop.pxd b/python/cudf/cudf/_lib/cpp/interop.pxd index e81f0d617fb..88e9d83ee98 100644 --- a/python/cudf/cudf/_lib/cpp/interop.pxd +++ b/python/cudf/cudf/_lib/cpp/interop.pxd @@ -1,12 +1,13 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from pyarrow.lib cimport CTable +from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -24,6 +25,7 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ ) except + cdef unique_ptr[table] from_arrow(CTable input) except + + cdef unique_ptr[scalar] from_arrow(CScalar input) except + cdef cppclass column_metadata: column_metadata() except + @@ -35,3 +37,8 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ table_view input, vector[column_metadata] metadata, ) except + + + cdef shared_ptr[CScalar] to_arrow( + const scalar& input, + column_metadata metadata, + ) except + diff --git a/python/cudf/cudf/_lib/cpp/reduce.pxd b/python/cudf/cudf/_lib/cpp/reduce.pxd index 7952c717916..997782dec6c 100644 --- a/python/cudf/cudf/_lib/cpp/reduce.pxd +++ b/python/cudf/cudf/_lib/cpp/reduce.pxd @@ -1,14 +1,13 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr from libcpp.utility cimport pair -from cudf._lib.aggregation cimport reduce_aggregation, scan_aggregation +from cudf._lib.cpp.aggregation cimport reduce_aggregation, scan_aggregation from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport data_type -from cudf._lib.scalar cimport DeviceScalar cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 8fd2a409d90..639754fc54f 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -4,7 +4,14 @@ from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table +from pyarrow.lib cimport ( + CScalar, + CTable, + pyarrow_unwrap_scalar, + pyarrow_unwrap_table, + pyarrow_wrap_scalar, + pyarrow_wrap_table, +) from cudf._lib.cpp.interop cimport ( DLManagedTensor, @@ -14,12 +21,22 @@ from cudf._lib.cpp.interop cimport ( to_arrow as cpp_to_arrow, to_dlpack as cpp_to_dlpack, ) +from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport type_id +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) +from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype def from_dlpack(dlpack_capsule): @@ -182,3 +199,79 @@ def from_arrow(object input_table): c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) return columns_from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def to_arrow_scalar(DeviceScalar source_scalar): + """Convert a scalar to a PyArrow scalar. + + Parameters + ---------- + source_scalar : the scalar to convert + + Returns + ------- + pyarrow.lib.Scalar + """ + cdef vector[column_metadata] cpp_metadata = gather_metadata( + [("", source_scalar.dtype)] + ) + cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr() + + cdef shared_ptr[CScalar] cpp_arrow_scalar + with nogil: + cpp_arrow_scalar = cpp_to_arrow( + source_scalar_ptr[0], cpp_metadata[0] + ) + + return pyarrow_wrap_scalar(cpp_arrow_scalar) + + +@acquire_spill_lock() +def from_arrow_scalar(object input_scalar, output_dtype=None): + """Convert from PyArrow scalar to a cudf scalar. 
+ + Parameters + ---------- + input_scalar : PyArrow scalar + output_dtype : output type to cast to, ignored except for decimals + + Returns + ------- + cudf._lib.DeviceScalar + """ + cdef shared_ptr[CScalar] cpp_arrow_scalar = ( + pyarrow_unwrap_scalar(input_scalar) + ) + cdef unique_ptr[scalar] c_result + + with nogil: + c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0])) + + cdef type_id ctype = c_result.get().type().id() + if ctype == type_id.DECIMAL128: + if output_dtype is None: + # Decimals must be cast to the cudf dtype of the right width + raise ValueError( + "Decimal scalars must be constructed with a dtype" + ) + + if isinstance(output_dtype, Decimal32Dtype): + c_result.reset( + new fixed_point_scalar[decimal32]( + ( c_result.get()).value(), + scale_type(-input_scalar.type.scale), + c_result.get().is_valid() + ) + ) + elif isinstance(output_dtype, Decimal64Dtype): + c_result.reset( + new fixed_point_scalar[decimal64]( + ( c_result.get()).value(), + scale_type(-input_scalar.type.scale), + c_result.get().is_valid() + ) + ) + # Decimal128Dtype is a no-op, no conversion needed. + + return DeviceScalar.from_unique_ptr(move(c_result), output_dtype) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0407785b2d8..5ab286c5701 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -2,22 +2,13 @@ cimport cython -import decimal +import copy import numpy as np import pandas as pd import pyarrow as pa -from libc.stdint cimport ( - int8_t, - int16_t, - int32_t, - int64_t, - uint8_t, - uint16_t, - uint32_t, - uint64_t, -) +from libc.stdint cimport int64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,38 +16,22 @@ from libcpp.utility cimport move from rmm._lib.memory_resource cimport get_current_device_resource import cudf -from cudf._lib.types import ( - LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, - datetime_unit_map, - duration_unit_map, -) +from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT -from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id -from cudf._lib.interop import from_arrow, to_arrow +from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, - fixed_point_scalar, list_scalar, - numeric_scalar, scalar, - string_scalar, struct_scalar, timestamp_scalar, ) -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) from cudf._lib.cpp.wrappers.durations cimport ( duration_ms, duration_ns, @@ -69,7 +44,21 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) -from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns + + +def _replace_nested(obj, check, replacement): + if isinstance(obj, list): + for i, item in enumerate(obj): + if check(item): + obj[i] = replacement + elif isinstance(item, (dict, list)): + _replace_nested(item, check, replacement) + elif isinstance(obj, dict): + for k, v in obj.items(): + if check(v): + obj[k] = replacement + elif isinstance(v, (dict, list)): + _replace_nested(v, check, replacement) # The DeviceMemoryResource attribute could be released prematurely @@ -97,61 
+86,61 @@ cdef class DeviceScalar: A NumPy dtype. """ self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') - self._set_value(value, self._dtype) - - def _set_value(self, value, dtype): - # IMPORTANT: this should only ever be called from __init__ - valid = not _is_null_host_scalar(value) - - if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - _set_decimal_from_scalar( - self.c_value, value, dtype, valid) - elif isinstance(dtype, cudf.ListDtype): - _set_list_from_pylist( - self.c_value, value, dtype, valid) - elif isinstance(dtype, cudf.StructDtype): - _set_struct_from_pydict(self.c_value, value, dtype, valid) + + if cudf.utils.utils.is_na_like(value): + value = None + else: + # TODO: For now we always deepcopy the input value to avoid + # overwriting the input values when replacing nulls. Since it's + # just host values it's not that expensive, but we could consider + # alternatives. + value = copy.deepcopy(value) + _replace_nested(value, cudf.utils.utils.is_na_like, None) + + if isinstance(dtype, cudf.core.dtypes._BaseDtype): + pa_type = dtype.to_arrow() elif pd.api.types.is_string_dtype(dtype): - _set_string_from_np_string(self.c_value, value, valid) - elif pd.api.types.is_numeric_dtype(dtype): - _set_numeric_from_np_scalar(self.c_value, - value, - dtype, - valid) - elif pd.api.types.is_datetime64_dtype(dtype): - _set_datetime64_from_np_scalar( - self.c_value, value, dtype, valid - ) - elif pd.api.types.is_timedelta64_dtype(dtype): - _set_timedelta64_from_np_scalar( - self.c_value, value, dtype, valid - ) + # Have to manually convert object types, which we use internally + # for strings but pyarrow only supports as unicode 'U' + pa_type = pa.string() else: - raise ValueError( - f"Cannot convert value of type " - f"{type(value).__name__} to cudf scalar" - ) + pa_type = pa.from_numpy_dtype(dtype) + + pa_scalar = pa.scalar(value, type=pa_type) + + # Note: This factory-like behavior in __init__ will be removed when + # migrating to pylibcudf. + cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype) + self.c_value.swap(obj.c_value) def _to_host_scalar(self): - if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): - result = _get_py_decimal_from_fixed_point(self.c_value) - elif cudf.api.types.is_struct_dtype(self.dtype): - result = _get_py_dict_from_struct(self.c_value, self.dtype) - elif cudf.api.types.is_list_dtype(self.dtype): - result = _get_py_list_from_list(self.c_value, self.dtype) - elif pd.api.types.is_string_dtype(self.dtype): - result = _get_py_string_from_string(self.c_value) - elif pd.api.types.is_numeric_dtype(self.dtype): - result = _get_np_scalar_from_numeric(self.c_value) - elif pd.api.types.is_datetime64_dtype(self.dtype): - result = _get_np_scalar_from_timestamp64(self.c_value) - elif pd.api.types.is_timedelta64_dtype(self.dtype): - result = _get_np_scalar_from_timedelta64(self.c_value) + is_datetime = self.dtype.kind == "M" + is_timedelta = self.dtype.kind == "m" + + null_type = NaT if is_datetime or is_timedelta else NA + + ps = to_arrow_scalar(self) + if not ps.is_valid: + return null_type + + # TODO: The special handling of specific types below does not currently + # extend to nested types containing those types (e.g. List[timedelta] + # where the timedelta would overflow). We should eventually account for + # those cases, but that will require more careful consideration of how + # to traverse the contents of the nested data. 
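+ # Datetime/timedelta results are rebuilt as NumPy scalars in the column's own time unit; numeric results are converted through the matching NumPy dtype, and all other types fall back to pyarrow's as_py().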
+ if is_datetime or is_timedelta: + time_unit, _ = np.datetime_data(self.dtype) + # Cast to int64 to avoid overflow + ps_cast = ps.cast('int64').as_py() + out_type = np.datetime64 if is_datetime else np.timedelta64 + ret = out_type(ps_cast, time_unit) + elif cudf.api.types.is_numeric_dtype(self.dtype): + ret = ps.type.to_pandas_dtype()(ps.as_py()) else: - raise ValueError( - "Could not convert cudf::scalar to a Python value" - ) - return result + ret = ps.as_py() + + _replace_nested(ret, lambda item: item is None, NA) + return ret @property def dtype(self): @@ -236,42 +225,9 @@ cdef class DeviceScalar: return s -cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): - value = value if valid else "" - s.reset(new string_scalar(value.encode(), valid)) - - -cdef _set_numeric_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - value = value if valid else 0 - if dtype == "int8": - s.reset(new numeric_scalar[int8_t](value, valid)) - elif dtype == "int16": - s.reset(new numeric_scalar[int16_t](value, valid)) - elif dtype == "int32": - s.reset(new numeric_scalar[int32_t](value, valid)) - elif dtype == "int64": - s.reset(new numeric_scalar[int64_t](value, valid)) - elif dtype == "uint8": - s.reset(new numeric_scalar[uint8_t](value, valid)) - elif dtype == "uint16": - s.reset(new numeric_scalar[uint16_t](value, valid)) - elif dtype == "uint32": - s.reset(new numeric_scalar[uint32_t](value, valid)) - elif dtype == "uint64": - s.reset(new numeric_scalar[uint64_t](value, valid)) - elif dtype == "float32": - s.reset(new numeric_scalar[float](value, valid)) - elif dtype == "float64": - s.reset(new numeric_scalar[double](value, valid)) - elif dtype == "bool": - s.reset(new numeric_scalar[bool](value, valid)) - else: - raise ValueError(f"dtype not supported: {dtype}") - - +# TODO: Currently the only uses of this function and the one below are in +# _create_proxy_nat_scalar. See if that code path can be simplified to excise +# or at least simplify these implementations. 
cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, object value, object dtype, @@ -324,253 +280,6 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, else: raise ValueError(f"dtype not supported: {dtype}") -cdef _set_decimal_from_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 - if isinstance(dtype, cudf.Decimal64Dtype): - s.reset( - new fixed_point_scalar[decimal64]( - np.int64(value), scale_type(-dtype.scale), valid - ) - ) - elif isinstance(dtype, cudf.Decimal32Dtype): - s.reset( - new fixed_point_scalar[decimal32]( - np.int32(value), scale_type(-dtype.scale), valid - ) - ) - elif isinstance(dtype, cudf.Decimal128Dtype): - s.reset( - new fixed_point_scalar[decimal128]( - value, scale_type(-dtype.scale), valid - ) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_struct_from_pydict(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - arrow_schema = dtype.to_arrow() - columns = [str(i) for i in range(len(arrow_schema))] - if valid: - pyarrow_table = pa.Table.from_arrays( - [ - pa.array([value[f.name]], from_pandas=True, type=f.type) - for f in arrow_schema - ], - names=columns - ) - else: - pyarrow_table = pa.Table.from_arrays( - [ - pa.array([NA], from_pandas=True, type=f.type) - for f in arrow_schema - ], - names=columns - ) - - data = from_arrow(pyarrow_table) - cdef table_view struct_view = table_view_from_columns(data) - - s.reset( - new struct_scalar(struct_view, valid) - ) - -cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype): - if not s.get()[0].is_valid(): - return NA - - cdef table_view struct_table_view = (s.get()).view() - columns = columns_from_table_view(struct_table_view, None) - struct_col = cudf.core.column.build_struct_column( - names=dtype.fields.keys(), - children=tuple(columns), - size=1, - ) - table = to_arrow([struct_col], [("None", dtype)]) - python_dict = table.to_pydict()["None"][0] - return {k: _nested_na_replace([python_dict[k]])[0] for k in python_dict} - -cdef _set_list_from_pylist(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else [NA] - cdef Column col - if isinstance(dtype.element_type, ListDtype): - pa_type = dtype.element_type.to_arrow() - else: - pa_type = dtype.to_arrow().value_type - col = cudf.core.column.as_column( - pa.array(value, from_pandas=True, type=pa_type) - ) - cdef column_view col_view = col.view() - s.reset( - new list_scalar(col_view, valid) - ) - - -cdef _get_py_list_from_list(unique_ptr[scalar]& s, dtype): - - if not s.get()[0].is_valid(): - return NA - - cdef column_view list_col_view = (s.get()).view() - cdef Column element_col = Column.from_column_view(list_col_view, None) - - arrow_obj = to_arrow([element_col], [("None", dtype.element_type)])["None"] - - result = arrow_obj.to_pylist() - return _nested_na_replace(result) - - -cdef _get_py_string_from_string(unique_ptr[scalar]& s): - if not s.get()[0].is_valid(): - return NA - return (s.get())[0].to_string().decode() - - -cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): - cdef scalar* s_ptr = s.get() - if not s_ptr[0].is_valid(): - return NA - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.INT8: - return np.int8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.INT16: - return np.int16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.INT32: - return 
np.int32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.INT64: - return np.int64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT8: - return np.uint8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT16: - return np.uint16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT32: - return np.uint32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT64: - return np.uint64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.FLOAT32: - return np.float32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.FLOAT64: - return np.float64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.BOOL8: - return np.bool_((s_ptr)[0].value()) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - - -cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): - cdef scalar* s_ptr = s.get() - if not s_ptr[0].is_valid(): - return NA - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.DECIMAL64: - rep_val = int((s_ptr)[0].value()) - scale = int((s_ptr)[0].type().scale()) - return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == libcudf_types.type_id.DECIMAL32: - rep_val = int((s_ptr)[0].value()) - scale = int((s_ptr)[0].type().scale()) - return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == libcudf_types.type_id.DECIMAL128: - rep_val = int((s_ptr)[0].value()) - scale = int((s_ptr)[0].type().scale()) - return decimal.Decimal(rep_val).scaleb(scale) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - -cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): - - cdef scalar* s_ptr = s.get() - - if not s_ptr[0].is_valid(): - return NaT - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.TIMESTAMP_SECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MILLISECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MICROSECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_NANOSECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - - -cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): - - cdef scalar* s_ptr = s.get() - - if not s_ptr[0].is_valid(): - return NaT - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.DURATION_SECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.DURATION_MILLISECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.DURATION_MICROSECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.DURATION_NANOSECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - def as_device_scalar(val, 
dtype=None): if isinstance(val, (cudf.Scalar, DeviceScalar)): @@ -607,16 +316,3 @@ def _create_proxy_nat_scalar(dtype): return result else: raise TypeError('NAT only valid for datetime and timedelta') - - -def _nested_na_replace(input_list): - ''' - Replace `None` with `cudf.NA` in the result of - `__getitem__` calls to list type columns - ''' - for idx, value in enumerate(input_list): - if isinstance(value, list): - _nested_na_replace(value) - elif value is None: - input_list[idx] = NA - return input_list diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index a472142ece0..25d58029d6b 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -4,12 +4,13 @@ import cupy as cp import numpy as np +from cudf.core.column import as_column from cudf.core.copy_types import BooleanMask -from cudf.core.index import Index, RangeIndex +from cudf.core.index import RangeIndex, as_index from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar -from cudf.core.series import Series from cudf.options import get_option +from cudf.utils.dtypes import can_convert_to_column def factorize( @@ -95,7 +96,13 @@ def factorize( return_cupy_array = isinstance(values, cp.ndarray) - values = Series(values) + if not can_convert_to_column(values): + raise TypeError( + "'values' can only be a Series, Index, or CuPy array, " + f"got {type(values)}" + ) + + values = as_column(values) if na_sentinel is None: na_sentinel = ( @@ -128,22 +135,22 @@ def factorize( warnings.warn("size_hint is not applicable for cudf.factorize") if use_na_sentinel is None or use_na_sentinel: - cats = values._column.dropna() + cats = values.dropna() else: - cats = values._column + cats = values cats = cats.unique().astype(values.dtype) if sort: cats = cats.sort_values() - labels = values._column._label_encoding( + labels = values._label_encoding( cats=cats, na_sentinel=Scalar(na_sentinel), dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else Index(cats) + return labels, cats.values if return_cupy_array else as_index(cats) def _linear_interpolation(column, index=None): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d2e2f11a12e..0bc50a521e2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1390,20 +1390,19 @@ def _return_sentinel_column(): except ValueError: return _return_sentinel_column() - codes = arange(len(cats), dtype=dtype) left_gather_map, right_gather_map = cpp_join( [self], [cats], how="left" ) - codes = codes.take( - right_gather_map, nullify=True, check_bounds=False - ).fillna(na_sentinel.value) - + codes = libcudf.copying.gather( + [arange(len(cats), dtype=dtype)], right_gather_map, nullify=True + ) + del right_gather_map # reorder `codes` so that its values correspond to the # values of `self`: - order = arange(len(self)) - order = order.take(left_gather_map, check_bounds=False).argsort() - codes = codes.take(order) - return codes + (codes,) = libcudf.sort.sort_by_key( + codes, [left_gather_map], [True], ["last"], stable=True + ) + return codes.fillna(na_sentinel.value) def column_empty_like( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 84c16b71997..8a3dbe77787 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1390,10 +1390,21 @@ def _get_numeric_data(self): return 
self[columns] @_cudf_nvtx_annotate - def assign(self, **kwargs): + def assign(self, **kwargs: Union[Callable[[Self], Any], Any]): """ Assign columns to DataFrame from keyword arguments. + Parameters + ---------- + **kwargs: dict mapping string column names to values + The value for each key can either be a literal column (or + something that can be converted to a column), or + a callable of one argument that will be given the + dataframe as an argument and should return the new column + (without modifying the input argument). + Columns are added in-order, so callables can refer to + column names constructed in the assignment. + Examples -------- >>> import cudf @@ -1405,15 +1416,9 @@ def assign(self, **kwargs): 1 1 4 2 2 5 """ - new_df = cudf.DataFrame(index=self.index.copy()) - for name, col in self._data.items(): - if name in kwargs: - new_df[name] = kwargs.pop(name) - else: - new_df._data[name] = col.copy() - + new_df = self.copy(deep=False) for k, v in kwargs.items(): - new_df[k] = v + new_df[k] = v(new_df) if callable(v) else v return new_df @classmethod @@ -5607,7 +5612,7 @@ def quantile( result.name = q return result - result.index = list(map(float, qs)) + result.index = cudf.Index(list(map(float, qs)), dtype="float64") return result @_cudf_nvtx_annotate @@ -7885,9 +7890,7 @@ def _get_union_of_indices(indexes): return indexes[0] else: merged_index = cudf.core.index.GenericIndex._concat(indexes) - merged_index = merged_index.drop_duplicates() - inds = merged_index._values.argsort() - return merged_index.take(inds) + return merged_index.drop_duplicates() def _get_union_of_series_names(series_list): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6224793d6f1..1e6d177f8ca 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -437,7 +437,7 @@ def get_column_values_na(col): ncol = self._num_columns if ncol == 0: return make_empty_matrix( - shape=(0, 0), dtype=np.dtype("float64"), order="F" + shape=(len(self), ncol), dtype=np.dtype("float64"), order="F" ) if dtype is None: diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b300c55b537..e1740140b44 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2336,6 +2336,170 @@ def pct_change( shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 + def value_counts( + self, + subset=None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrameOrSeries: + """ + Return a Series or DataFrame containing counts of unique rows. + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + Series or DataFrame + Series if the groupby as_index is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. + + Notes + ----- + - If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. 
+ - If the groupby as_index is False then the returned DataFrame will + have an additional column with the value_counts. The column is + labelled 'count' or 'proportion', depending on the ``normalize`` + parameter. + + By default, rows that contain any NA values are omitted from + the result. + + By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ + + df = cudf.DataFrame.copy(self.obj) + groupings = self.grouping.names + name = "proportion" if normalize else "count" + + if subset is None: + subset = [i for i in df._column_names if i not in groupings] + # Check subset exists in dataframe + elif set(subset) - set(df._column_names): + raise ValueError( + f"Keys {set(subset) - set(df._column_names)} in subset " + f"do not exist in the DataFrame." + ) + # Catch case where groupby and subset share an element + elif set(subset) & set(groupings): + raise ValueError( + f"Keys {set(subset) & set(groupings)} in subset " + "cannot be in the groupby column keys." 
+ ) + + df["__placeholder"] = 1 + result = ( + df.groupby(groupings + list(subset), dropna=dropna)[ + "__placeholder" + ] + .count() + .sort_index() + .astype(np.int64) + ) + + if normalize: + levels = list(range(len(groupings), result.index.nlevels)) + result /= result.groupby( + result.index.droplevel(levels), + ).transform("sum") + + if sort: + result = result.sort_values(ascending=ascending).sort_index( + level=range(len(groupings)), sort_remaining=False + ) + + if not self._as_index: + if name in df._column_names: + raise ValueError( + f"Column label '{name}' is duplicate of result column" + ) + result.name = name + result = result.to_frame().reset_index() + else: + result.name = name + + return result + def _mimic_pandas_order( self, result: DataFrameOrSeries ) -> DataFrameOrSeries: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 56ec9ce0359..de8a5948033 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -13,6 +13,7 @@ List, MutableMapping, Optional, + Sequence, Tuple, Type, Union, @@ -3467,7 +3468,7 @@ def __new__( "tupleize_cols != True is not yet supported" ) - return as_index( + res = as_index( data, copy=copy, dtype=dtype, @@ -3475,6 +3476,15 @@ def __new__( nan_as_null=nan_as_null, **kwargs, ) + if ( + isinstance(data, Sequence) + and not isinstance(data, range) + and len(data) == 0 + and dtype is None + and getattr(data, "dtype", None) is None + ): + return res.astype("str") + return res @classmethod @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 62e091b29b5..aacf1fa8dae 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5438,6 +5438,13 @@ def _is_same_dtype(lhs_dtype, rhs_dtype): # for matching column dtype. if lhs_dtype == rhs_dtype: return True + elif ( + is_categorical_dtype(lhs_dtype) + and is_categorical_dtype(rhs_dtype) + and lhs_dtype.categories.dtype == rhs_dtype.categories.dtype + ): + # OK if categories are not all the same + return True elif ( is_categorical_dtype(lhs_dtype) and not is_categorical_dtype(rhs_dtype) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7692d3015f8..a195738af54 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,16 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Union +from typing import ( + Any, + Dict, + MutableMapping, + Optional, + Sequence, + Set, + Tuple, + Union, +) import cupy import numpy as np @@ -500,6 +509,18 @@ def __init__( copy=False, nan_as_null=True, ): + if ( + isinstance(data, Sequence) + and len(data) == 0 + and dtype is None + and getattr(data, "dtype", None) is None + ): + warnings.warn( + "The default dtype for empty Series will be 'object' instead " + "of 'float64' in a future version. 
Specify a dtype explicitly " + "to silence this warning.", + FutureWarning, + ) if isinstance(data, pd.Series): if name is None: name = data.name @@ -656,7 +677,10 @@ def from_pandas(cls, s, nan_as_null=None): 3 NaN dtype: float64 """ - return cls(s, nan_as_null=nan_as_null) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + result = cls(s, nan_as_null=nan_as_null) + return result @property # type: ignore @_cudf_nvtx_annotate @@ -2642,7 +2666,9 @@ def mode(self, dropna=True): if len(val_counts) > 0: val_counts = val_counts[val_counts == val_counts.iloc[0]] - return Series(val_counts.index.sort_values(), name=self.name) + return Series._from_data( + {self.name: val_counts.index.sort_values()}, name=self.name + ) @_cudf_nvtx_annotate def round(self, decimals=0, how="half_even"): diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index e949f7d78e7..9182246826f 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -397,8 +397,12 @@ def assert_column_memory_ne( raise AssertionError("lhs and rhs holds the same memory.") -def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): - # Wrapper around pd.Series using a float64 default dtype for empty data. +def _create_pandas_series_float64_default( + data=None, index=None, dtype=None, *args, **kwargs +): + # Wrapper around pd.Series using a float64 + # default dtype for empty data to silence warnings. + # TODO: Remove this in pandas-2.0 upgrade if dtype is None and ( data is None or (not is_scalar(data) and len(data) == 0) ): @@ -406,6 +410,19 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs) +def _create_cudf_series_float64_default( + data=None, index=None, dtype=None, *args, **kwargs +): + # Wrapper around cudf.Series using a float64 + # default dtype for empty data to silence warnings. 
+ # TODO: Remove this in pandas-2.0 upgrade + if dtype is None and ( + data is None or (not is_scalar(data) and len(data) == 0) + ): + dtype = "float64" + return cudf.Series(data=data, index=index, dtype=dtype, *args, **kwargs) + + parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( "left_dtype,right_dtype", list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index cbef9bfa2d8..67b63028fab 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -30,6 +30,7 @@ ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, + _create_cudf_series_float64_default, assert_eq, assert_exceptions_equal, assert_neq, @@ -220,6 +221,18 @@ def test_init_unaligned_with_index(): assert_eq(pdf, gdf, check_dtype=False) +def test_init_series_list_columns_unsort(): + pseries = [ + pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] + gseries = [ + cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] + pdf = pd.DataFrame(pseries) + gdf = cudf.DataFrame(gseries) + assert_eq(pdf, gdf) + + def test_series_basic(): # Make series from buffer a1 = np.arange(10, dtype=np.float64) @@ -1314,6 +1327,25 @@ def test_assign(): np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4]) +@pytest.mark.parametrize( + "mapping", + [ + {"y": 1, "z": lambda df: df["x"] + df["y"]}, + { + "x": lambda df: df["x"] * 2, + "y": lambda df: 2, + "z": lambda df: df["x"] / df["y"], + }, + ], +) +def test_assign_callable(mapping): + df = pd.DataFrame({"x": [1, 2, 3]}) + cdf = cudf.from_pandas(df) + expect = df.assign(**mapping) + actual = cdf.assign(**mapping) + assert_eq(expect, actual) + + @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) @pytest.mark.parametrize("method", ["murmur3", "md5"]) @pytest.mark.parametrize("seed", [None, 42]) @@ -2000,8 +2032,8 @@ def test_series_shape(): def test_series_shape_empty(): - ps = pd.Series(dtype="float64") - cs = cudf.Series([]) + ps = pd.Series([], dtype="float64") + cs = cudf.Series([], dtype="float64") assert ps.shape == cs.shape @@ -2840,7 +2872,7 @@ def test_series_all_null(num_elements, null_type): @pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) def test_series_all_valid_nan(num_elements): data = [np.nan] * num_elements - sr = cudf.Series(data, nan_as_null=False) + sr = _create_cudf_series_float64_default(data, nan_as_null=False) np.testing.assert_equal(sr.null_count, 0) @@ -4073,28 +4105,28 @@ def test_empty_dataframe_describe(): def test_as_column_types(): - col = column.as_column(cudf.Series([])) + col = column.as_column(cudf.Series([], dtype="float64")) assert_eq(col.dtype, np.dtype("float64")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) - col = column.as_column(cudf.Series([]), dtype="float32") + col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32") assert_eq(col.dtype, np.dtype("float32")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) - col = column.as_column(cudf.Series([]), dtype="str") + col = column.as_column(cudf.Series([], dtype="float64"), dtype="str") assert_eq(col.dtype, np.dtype("object")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, gds) - col = column.as_column(cudf.Series([]), dtype="object") + col = column.as_column(cudf.Series([], dtype="float64"), dtype="object") assert_eq(col.dtype, np.dtype("object")) gds = cudf.Series(col) pds = 
pd.Series(pd.Series([], dtype="object")) @@ -4469,7 +4501,7 @@ def test_create_dataframe_column(): ) def test_series_values_host_property(data): pds = pd.Series(data=data, dtype=None if data else float) - gds = cudf.Series(data) + gds = _create_cudf_series_float64_default(data) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -4492,7 +4524,7 @@ def test_series_values_host_property(data): ) def test_series_values_property(data): pds = pd.Series(data=data, dtype=None if data else float) - gds = cudf.Series(data) + gds = _create_cudf_series_float64_default(data) gds_vals = gds.values assert isinstance(gds_vals, cupy.ndarray) np.testing.assert_array_equal(gds_vals.get(), pds.values) @@ -10374,3 +10406,22 @@ def test_dataframe_init_from_nested_dict(): pdf = pd.DataFrame(regular_dict) gdf = cudf.DataFrame(regular_dict) assert_eq(pdf, gdf) + + +def test_init_from_2_categoricalindex_series_diff_categories(): + s1 = cudf.Series( + [39, 6, 4], index=cudf.CategoricalIndex(["female", "male", "unknown"]) + ) + s2 = cudf.Series( + [2, 152, 2, 242, 150], + index=cudf.CategoricalIndex(["f", "female", "m", "male", "unknown"]), + ) + result = cudf.DataFrame([s1, s2]) + expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()]) + assert_eq(result, expected, check_dtype=False) + + +def test_data_frame_values_no_cols_but_index(): + result = cudf.DataFrame(index=range(5)).values + expected = pd.DataFrame(index=range(5)).values + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 3277e52edb3..1def6597706 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -1,11 +1,14 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import numpy as np import pandas as pd import pytest import cudf -from cudf.testing._utils import _create_pandas_series, assert_eq +from cudf.testing._utils import ( + _create_pandas_series_float64_default, + assert_eq, +) @pytest.mark.parametrize( @@ -22,7 +25,7 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index f77e7b4d775..ddbfdf5eee2 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -10,7 +10,7 @@ import cudf from cudf import concat from cudf.testing._utils import ( - _create_pandas_series, + _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, ) @@ -62,7 +62,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, keep): - pds = _create_pandas_series(data) + pds = _create_pandas_series_float64_default(data) gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 042f0e1aa38..376639d5226 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3473,3 +3473,70 @@ def test_categorical_grouping_pandas_compatibility(): expected = pdf.groupby("key", sort=False).sum() assert_eq(actual, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): + # From Issue#12789 + df = cudf.DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", np.nan, "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + pdf = df.to_pandas() + + actual = df.groupby("gender", as_index=as_index).value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + expected = pdf.groupby("gender", as_index=as_index).value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + + # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` + assert_groupby_results_equal( + actual, expected, check_names=False, check_index_type=False + ) + + +def test_group_by_value_counts_subset(): + # From Issue#12789 + df = cudf.DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + pdf = df.to_pandas() + + actual = df.groupby("gender").value_counts(["education"]) + expected = pdf.groupby("gender").value_counts(["education"]) + + # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` + assert_groupby_results_equal( + actual, expected, check_names=False, check_index_type=False + ) + + +def test_group_by_value_counts_clash_with_subset(): + df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a").value_counts(["a"]) + + +def test_group_by_value_counts_subset_not_exists(): + df = cudf.DataFrame({"a": [1, 5, 3], "b": 
[2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a").value_counts(["c"]) + + +def test_group_by_value_counts_with_count_column(): + df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a", as_index=False).value_counts() diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index b3791cddce3..29232f63e90 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -30,7 +30,8 @@ SIGNED_INTEGER_TYPES, SIGNED_TYPES, UNSIGNED_TYPES, - _create_pandas_series, + _create_cudf_series_float64_default, + _create_pandas_series_float64_default, assert_column_memory_eq, assert_column_memory_ne, assert_eq, @@ -1006,8 +1007,8 @@ def test_index_equal_misc(data, other): actual = gd_data.equals(np.array(gd_other)) assert_eq(expected, actual) - expected = pd_data.equals(_create_pandas_series(pd_other)) - actual = gd_data.equals(cudf.Series(gd_other)) + expected = pd_data.equals(_create_pandas_series_float64_default(pd_other)) + actual = gd_data.equals(_create_cudf_series_float64_default(gd_other)) assert_eq(expected, actual) expected = pd_data.astype("category").equals(pd_other) @@ -2275,7 +2276,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): ], ) def test_isin_index(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.index.isin(values) @@ -2780,6 +2781,13 @@ def test_index_empty_from_pandas(request, dtype): assert_eq(pidx, gidx) +def test_empty_index_init(): + pidx = pd.Index([]) + gidx = cudf.Index([]) + + assert_eq(pidx, gidx) + + @pytest.mark.parametrize( "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] ) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5dd58d8a875..ac10dd97c56 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -895,14 +895,14 @@ def test_memory_usage(): "data, idx", [ ( - [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": None}]], + [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": NA}]], 0, ), ( [ [ {"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"}, - {"f1": "sf12", "f2": None}, + {"f1": "sf12", "f2": NA}, ] ], 0, diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index b4e0983a9e3..43fa83e1735 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -9,7 +9,10 @@ import cudf from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 -from cudf.testing._utils import _create_pandas_series, assert_eq +from cudf.testing._utils import ( + _create_pandas_series_float64_default, + assert_eq, +) from cudf.testing.dataset_generator import rand_dataframe @@ -55,7 +58,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = _create_pandas_series(data, index=index) + psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.Series(psr) for window_size in range(1, len(data) + 1): for min_periods in range(1, window_size + 1): @@ -313,7 +316,7 @@ def test_rolling_getitem_window(): @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = _create_pandas_series(data, index=index) + psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.from_pandas(psr) def some_func(A): diff --git 
a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index b1e991106ee..cfa571a0f54 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -19,7 +19,8 @@ NUMERIC_TYPES, SERIES_OR_INDEX_NAMES, TIMEDELTA_TYPES, - _create_pandas_series, + _create_cudf_series_float64_default, + _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -400,8 +401,8 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = _create_pandas_series(data) - gsr = cudf.Series(data) + psr = _create_pandas_series_float64_default(data) + gsr = _create_cudf_series_float64_default(data) assert_eq(psr.size, gsr.size) @@ -487,7 +488,7 @@ def test_series_describe_other_types(ps): ) @pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0]) def test_series_factorize(data, na_sentinel): - gsr = cudf.Series(data) + gsr = _create_cudf_series_float64_default(data) psr = gsr.to_pandas() with pytest.warns(FutureWarning): @@ -510,7 +511,7 @@ def test_series_factorize(data, na_sentinel): ) @pytest.mark.parametrize("use_na_sentinel", [True, False]) def test_series_factorize_use_na_sentinel(data, use_na_sentinel): - gsr = cudf.Series(data) + gsr = _create_cudf_series_float64_default(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize( @@ -534,7 +535,7 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel): ) @pytest.mark.parametrize("sort", [True, False]) def test_series_factorize_sort(data, sort): - gsr = cudf.Series(data) + gsr = _create_cudf_series_float64_default(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize(sort=sort) @@ -734,7 +735,7 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize): ], dtype="datetime64[ns]", ), - cudf.Series(name="empty series"), + cudf.Series(name="empty series", dtype="float64"), cudf.Series(["a", "b", "c", " ", "a", "b", "z"], dtype="category"), ], ) @@ -1415,7 +1416,7 @@ def test_series_hash_values_invalid_method(): def test_set_index_unequal_length(): - s = cudf.Series() + s = cudf.Series(dtype="float64") with pytest.raises(ValueError): s.index = [1, 2, 3] @@ -1682,7 +1683,7 @@ def test_series_nunique_index(data): ], ) def test_axes(data): - csr = cudf.Series(data) + csr = _create_cudf_series_float64_default(data) psr = csr.to_pandas() expected = psr.axes @@ -1760,7 +1761,7 @@ def test_series_truncate_datetimeindex(): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = _create_pandas_series(data, index=index) + psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) expected = psr.isin(values) @@ -1820,7 +1821,7 @@ def test_fill_new_category(): ], ) def test_isin_datetime(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1849,7 +1850,7 @@ def test_isin_datetime(data, values): ], ) def test_isin_string(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1878,7 +1879,7 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) 
got = gsr.isin(values) @@ -2099,7 +2100,7 @@ def test_series_to_dict(into): ], ) def test_series_hasnans(data): - gs = cudf.Series(data, nan_as_null=False) + gs = _create_cudf_series_float64_default(data, nan_as_null=False) ps = gs.to_pandas(nullable=True) assert_eq(gs.hasnans, ps.hasnans) @@ -2170,8 +2171,8 @@ def test_series_init_dict_with_index(data, index): "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] ) def test_series_init_scalar_with_index(data, index): - pandas_series = _create_pandas_series(data, index=index) - cudf_series = cudf.Series(data, index=index) + pandas_series = _create_pandas_series_float64_default(data, index=index) + cudf_series = _create_cudf_series_float64_default(data, index=index) assert_eq( pandas_series, @@ -2313,7 +2314,15 @@ def test_series_round_builtin(data, digits): assert_eq(expected, actual) +def test_series_empty_warning(): + with pytest.warns(FutureWarning): + expected = pd.Series([]) + with pytest.warns(FutureWarning): + actual = cudf.Series([]) + assert_eq(expected, actual) + + def test_series_count_invalid_param(): - s = cudf.Series([]) + s = cudf.Series([], dtype="float64") with pytest.raises(TypeError): s.count(skipna=True) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 463cdb8a7f4..3ac605a1a4d 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -10,7 +10,8 @@ import cudf from cudf.datasets import randomdata from cudf.testing._utils import ( - _create_pandas_series, + _create_cudf_series_float64_default, + _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -222,8 +223,8 @@ def test_approx_quantiles_int(): ) def test_misc_quantiles(data, q): - pdf_series = _create_pandas_series(data) - gdf_series = cudf.Series(data) + pdf_series = _create_pandas_series_float64_default(data) + gdf_series = _create_cudf_series_float64_default(data) expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) actual = gdf_series.quantile(q) @@ -242,7 +243,7 @@ def test_misc_quantiles(data, q): [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -292,7 +293,7 @@ def test_kurt_skew_error(op): [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -348,7 +349,7 @@ def test_series_median(dtype, num_na): np.zeros(100), np.array([1.123, 2.343, np.nan, 0.0]), np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -376,7 +377,7 @@ def test_series_pct_change(data, periods, fill_method): np.array([1.123, 2.343, np.nan, 0.0]), cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), cudf.Series([1.1, 2.32, 43.4], index=[0, 4, 3]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -420,7 +421,7 @@ def test_cov1d(data1, data2): np.array([1.123, 2.343, np.nan, 0.0]), cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), cudf.Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -524,14 +525,14 @@ def test_df_corr(method): ) @pytest.mark.parametrize("skipna", [True, False]) def 
test_nans_stats(data, ops, skipna): - psr = _create_pandas_series(data) - gsr = cudf.Series(data, nan_as_null=False) + psr = _create_pandas_series_float64_default(data) + gsr = _create_cudf_series_float64_default(data, nan_as_null=False) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - gsr = cudf.Series(data, nan_as_null=False) + gsr = _create_cudf_series_float64_default(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. So only # testing for `skipna=True` when `nan_as_null=False` diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index a3593e55b97..ce6dc587320 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -150,9 +150,7 @@ def test_struct_setitem(data, item): "data", [ {"a": 1, "b": "rapids", "c": [1, 2, 3, 4]}, - {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, {"a": "Hello"}, - {"b": [], "c": [1, 2, 3]}, ], ) def test_struct_scalar_host_construction(data): @@ -161,6 +159,39 @@ def test_struct_scalar_host_construction(data): assert list(slr.device_value.value.values()) == list(data.values()) +@pytest.mark.parametrize( + ("data", "dtype"), + [ + ( + {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, + cudf.StructDtype( + { + "a": np.dtype(np.int64), + "b": np.dtype(np.str_), + "c": cudf.ListDtype(np.dtype(np.int64)), + "d": np.dtype(np.int64), + } + ), + ), + ( + {"b": [], "c": [1, 2, 3]}, + cudf.StructDtype( + { + "b": cudf.ListDtype(np.dtype(np.int64)), + "c": cudf.ListDtype(np.dtype(np.int64)), + } + ), + ), + ], +) +def test_struct_scalar_host_construction_no_dtype_inference(data, dtype): + # cudf cannot infer the dtype of the scalar when it contains only nulls or + # is empty. + slr = cudf.Scalar(data, dtype=dtype) + assert slr.value == data + assert list(slr.device_value.value.values()) == list(data.values()) + + def test_struct_scalar_null(): slr = cudf.Scalar(cudf.NA, dtype=StructDtype) assert slr.device_value.value is cudf.NA diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1b94db75340..73ea8e2cfc4 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -463,24 +463,6 @@ def _get_nan_for_dtype(dtype): return np.float64("nan") -def _decimal_to_int64(decimal: Decimal) -> int: - """ - Scale a Decimal such that the result is the integer - that would result from removing the decimal point. 
- - Examples - -------- - >>> _decimal_to_int64(Decimal('1.42')) - 142 - >>> _decimal_to_int64(Decimal('0.0042')) - 42 - >>> _decimal_to_int64(Decimal('-1.004201')) - -1004201 - - """ - return int(f"{decimal:0f}".replace(".", "")) - - def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): error = TypeError( f"{op} not supported between {dtype_l} and {dtype_r} scalars" diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 574769f68d1..085d78afc7c 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.0", "ninja", - "numpy>=1.21", + "numpy>=1.21,<1.25", "protoc-wheel", "pyarrow==12.0.1.*", "rmm==23.10.*", @@ -31,8 +31,8 @@ dependencies = [ "cuda-python>=11.7.1,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numba>=0.57", - "numpy>=1.21", + "numba>=0.57,<0.58", + "numpy>=1.21,<1.25", "nvtx>=0.2.1", "packaging", "pandas>=1.3,<1.6.0dev0", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a6ef867451b..386cdc32ab1 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "cython>=3.0.0", - "numpy>=1.21", + "numpy>=1.21,<1.25", "pyarrow==12.0.1.*", "setuptools", "wheel", diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index e3f4f04eb85..344b03c631d 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -373,22 +373,37 @@ def percentile_cudf(a, q, interpolation="linear"): @pyarrow_schema_dispatch.register((cudf.DataFrame,)) -def _get_pyarrow_schema_cudf(obj, preserve_index=True, **kwargs): +def _get_pyarrow_schema_cudf(obj, preserve_index=None, **kwargs): if kwargs: warnings.warn( "Ignoring the following arguments to " f"`pyarrow_schema_dispatch`: {list(kwargs)}" ) - return meta_nonempty(obj).to_arrow(preserve_index=preserve_index).schema + + return _cudf_to_table( + meta_nonempty(obj), preserve_index=preserve_index + ).schema @to_pyarrow_table_dispatch.register(cudf.DataFrame) -def _cudf_to_table(obj, preserve_index=True, **kwargs): +def _cudf_to_table(obj, preserve_index=None, **kwargs): if kwargs: warnings.warn( "Ignoring the following arguments to " f"`to_pyarrow_table_dispatch`: {list(kwargs)}" ) + + # TODO: Remove this logic when cudf#14159 is resolved + # (see: https://github.com/rapidsai/cudf/issues/14159) + if preserve_index and isinstance(obj.index, cudf.RangeIndex): + obj = obj.copy() + obj.index.name = ( + obj.index.name + if obj.index.name is not None + else "__index_level_0__" + ) + obj.index = obj.index._as_int_index() + return obj.to_arrow(preserve_index=preserve_index) @@ -401,7 +416,15 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs): f"Ignoring the following arguments to " f"`from_pyarrow_table_dispatch`: {list(kwargs)}" ) - return obj.from_arrow(table) + result = obj.from_arrow(table) + + # TODO: Remove this logic when cudf#14159 is resolved + # (see: https://github.com/rapidsai/cudf/issues/14159) + if "__index_level_0__" in result.index.names: + assert len(result.index.names) == 1 + result.index.name = None + + return result @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index e841f2d8830..d6c9c1be73c 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -6,7 +6,7 @@ import numpy as np import tlz as toolz -import dask 
+from dask import config from dask.base import tokenize from dask.dataframe import methods from dask.dataframe.core import DataFrame, Index, Series @@ -18,6 +18,8 @@ from cudf.api.types import is_categorical_dtype from cudf.utils.utils import _dask_cudf_nvtx_annotate +_SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported + @_dask_cudf_nvtx_annotate def set_index_post(df, index_name, drop, column_dtype): @@ -307,15 +309,25 @@ def sort_values( return df4 +def get_default_shuffle_method(): + # Note that `dask.utils.get_default_shuffle_method` + # will return "p2p" by default when a distributed + # client is present. Dask-cudf supports "p2p", but + # will not use it by default (yet) + default = config.get("dataframe.shuffle.method", "tasks") + if default not in _SHUFFLE_SUPPORT: + default = "tasks" + return default + + def _get_shuffle_type(shuffle): # Utility to set the shuffle-kwarg default - # and to validate user-specified options. - # The only supported options is currently "tasks" - shuffle = shuffle or dask.config.get("shuffle", "tasks") - if shuffle != "tasks": + # and to validate user-specified options + shuffle = shuffle or get_default_shuffle_method() + if shuffle not in _SHUFFLE_SUPPORT: raise ValueError( - f"Dask-cudf only supports in-memory shuffling with " - f"'tasks'. Got shuffle={shuffle}" + "Dask-cudf only supports the following shuffle " + f"methods: {_SHUFFLE_SUPPORT}. Got shuffle={shuffle}" ) return shuffle diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index cf49b1df4f4..c64e25fd437 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -22,18 +22,25 @@ def test_is_categorical_dispatch(): assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category")) -def test_pyarrow_conversion_dispatch(): +@pytest.mark.parametrize("preserve_index", [True, False]) +def test_pyarrow_conversion_dispatch(preserve_index): from dask.dataframe.dispatch import ( from_pyarrow_table_dispatch, to_pyarrow_table_dispatch, ) df1 = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc")) - df2 = from_pyarrow_table_dispatch(df1, to_pyarrow_table_dispatch(df1)) + df2 = from_pyarrow_table_dispatch( + df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index) + ) assert type(df1) == type(df2) assert_eq(df1, df2) + # Check that preserve_index does not produce a RangeIndex + if preserve_index: + assert not isinstance(df2.index, cudf.RangeIndex) + @pytest.mark.parametrize("index", [None, [1, 2] * 5]) def test_deterministic_tokenize(index): diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index e24feaa2ea4..db3f3695648 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import numba.cuda import pytest @@ -77,3 +77,23 @@ def test_str_series_roundtrip(): actual = dask_series.compute() assert_eq(actual, expected) + + +def test_p2p_shuffle(): + # Check that we can use `shuffle="p2p"` + with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: + with Client(cluster): + ddf = ( + dask.datasets.timeseries( + start="2000-01-01", + end="2000-01-08", + dtypes={"x": int}, + ) + .reset_index(drop=True) + .to_backend("cudf") + ) + dd.assert_eq( + ddf.sort_values("x", shuffle="p2p").compute(), + ddf.compute().sort_values("x"), + check_index=False, + ) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 2464abca71a..922da366422 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "dask>=2023.7.1", "distributed>=2023.7.1", "fsspec>=0.6.0", - "numpy>=1.21", + "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -40,7 +40,7 @@ dynamic = ["entry-points"] [project.optional-dependencies] test = [ "dask-cuda==23.10.*", - "numba>=0.57", + "numba>=0.57,<0.58", "pytest", "pytest-cov", "pytest-xdist",