diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index af8d1289ea1..6f0e88fb245 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -62,3 +62,33 @@ jobs: UPDATE_ITEM: true UPDATE_LINKED_ISSUES: true secrets: inherit + + process-branch-name: + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: get-project-id + runs-on: ubuntu-latest + outputs: + branch-name: ${{ steps.process-branch-name.outputs.branch-name }} + steps: + - name: Extract branch name + id: process-branch-name + run: | + branch=${{ github.event.pull_request.base.ref }} + release=${branch#branch-} + echo "branch-name=$release" >> "$GITHUB_OUTPUT" + + update-release: + # This job sets the PR and its linked issues to the release they are targeting + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12 + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: [get-project-id, process-branch-name] + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ" + SINGLE_SELECT_FIELD_NAME: "Release" + SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}" + ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + UPDATE_ITEM: true + UPDATE_LINKED_ISSUES: true + secrets: inherit diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index 8975381ceba..91bc071583e 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -5,11 +5,15 @@ set -euo pipefail package_dir="python/libcudf" +export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen 
${RAPIDS_CUDA_VERSION})" mkdir -p ${package_dir}/final_dist -python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* +python -m auditwheel repair \ + --exclude libnvcomp.so.4 \ + -w ${package_dir}/final_dist \ + ${package_dir}/dist/* RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 55399d0371a..f5bcdc62604 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -24,14 +24,17 @@ rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -# Download the pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep +# Download libcudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep -rapids-logger "Install pylibcudf" -python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl +rapids-logger "Install libcudf, pylibcudf and cudf_polars" +python -m pip install \ + -v \ + "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ + "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" -rapids-logger "Install cudf_polars" -python -m pip install $(echo ./dist/cudf_polars*.whl) TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') rapids-logger "Clone polars to ${TAG}" diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index 2386414b32e..9528549a562 100755 --- 
a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi - +rapids-print-env EXITCODE=0 trap "EXITCODE=1" ERR set +e diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8b45d26c367..bd5e6c3d569 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -63,7 +63,7 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc - polars>=1.8,<1.9 - pre-commit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 354c1360e5a..565a3ebfa3c 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -61,7 +61,7 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc - polars>=1.8,<1.9 - pre-commit diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 25e69b89789..2c254415318 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - numba-cuda >=0.0.13 - numpy >=1.23,<3.0a0 diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 7c1efa0176c..3d965f30986 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -77,7 +77,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index d766d98b45e..2d4f8c0d80e 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -1,17 
+1,47 @@ --- +# Notes on disabled checks +# ------------------------ +# modernize-use-equals-default: +# auto-fix is broken (doesn't insert =default correctly) +# modernize-concat-nested-namespaces: +# auto-fix is broken (can delete code) +# modernize-use-trailing-return-type: +# Purely stylistic, no benefit to rewriting everything +# modernize-return-braced-init-list: +# Stylistically we prefer to see the return type at the return site. +# See https://github.com/rapidsai/cudf/pull/16956#pullrequestreview-2341891672 +# for more information. +# modernize-use-bool-literals: +# Our tests use int flags for validity masks extensively and we prefer that +# clang-analyzer-cplusplus.NewDeleteLeaks: +# This check has numerous bugs, see +# https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+newdeleteleaks +# We encounter at least +# https://github.com/llvm/llvm-project/issues/60896 +# https://github.com/llvm/llvm-project/issues/69602 +# clang-analyzer-optin.core.EnumCastOutOfRange: +# We use enums as flags in multiple cases and this check makes ORing flags invalid +# clang-analyzer-optin.cplusplus.UninitializedObject: +# There is an error in nanoarrow that none of the clang-tidy filters (i.e. +# header-filter and exclude-header-filter) are able to properly avoid. 
This +# merits further investigation +# +# We need to verify that broken checks are still broken Checks: 'modernize-*, -modernize-use-equals-default, -modernize-concat-nested-namespaces, - -modernize-use-trailing-return-type' - - # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) - # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) - # -modernize-use-trailing-return-type # just a preference + -modernize-use-trailing-return-type, + -modernize-return-braced-init-list, + -modernize-use-bool-literals, + clang-analyzer-*, + -clang-analyzer-cplusplus.NewDeleteLeaks, + -clang-analyzer-optin.core.EnumCastOutOfRange, + -clang-analyzer-optin.cplusplus.UninitializedObject' WarningsAsErrors: '' -HeaderFilterRegex: '' -AnalyzeTemporaryDtors: false +HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' +ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none CheckOptions: - key: modernize-loop-convert.MaxCopySize diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 136f43ee706..f7a5dd2f2fb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,6 +52,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) +option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) option( diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..b8a53cd8bd9 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # 
################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- @@ -392,11 +392,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# ---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 65a44532cf1..f44f26e4d2c 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,14 +15,16 @@ */ #include -#include -#include #include #include +#include + #include +#include + #include #include #include @@ -35,13 +37,10 @@ enum class TreeType { }; template -class AST : public cudf::benchmark {}; - -template -static void BM_ast_transform(benchmark::State& state) +static void BM_ast_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size = static_cast(state.get_int64("table_size")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state) auto const& expression_tree_root = expressions.back(); - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::compute_column(table, expression_tree_root); - } - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); -} + state.add_global_memory_reads(table_size * (tree_levels + 1)); -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, 
nullable) \ - (::benchmark::State & st) \ + static void name(::nvbench::state& st) \ { \ - BM_ast_transform(st); \ + ::BM_ast_transform(st); \ } \ - BENCHMARK_REGISTER_F(AST, name) \ - ->Apply(CustomRanges) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("tree_levels", {1, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index fa98d9e601a..7d267a88764 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,15 +15,14 @@ */ #include -#include -#include #include #include #include +#include + #include -#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -33,13 +32,10 @@ enum class TreeType { }; template -class BINARYOP : public cudf::benchmark {}; - -template -static void BM_binaryop_transform(benchmark::State& state) +static void BM_binaryop_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size{static_cast(state.get_int64("table_size"))}; + auto const tree_levels{static_cast(state.get_int64("tree_levels"))}; // Create table data auto const n_cols = reuse_columns ? 
1 : tree_levels + 1; @@ -47,9 +43,10 @@ static void BM_binaryop_transform(benchmark::State& state) cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); cudf::table_view table{*source_table}; - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + // Use the number of bytes read from global memory + state.add_global_memory_reads(table_size * (tree_levels + 1)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) auto const op = cudf::binary_operator::ADD; auto const result_data_type = cudf::data_type(cudf::type_to_id()); @@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state) result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } - } - - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); + }); } #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \ - (::benchmark::State & st) { BM_binaryop_transform(st); } + \ + static void name(::nvbench::state& st) \ + { \ + BM_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique, int32_t, @@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false); - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 2, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : 
operation_counts) { - b->Args({row_count, operation_count}); - } - } -} - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 7086a61c7c5..bc0ff69bce9 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -15,20 +15,18 @@ */ #include -#include -#include #include -class COMPILED_BINARYOP : public cudf::benchmark {}; +#include template -void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) { - auto const column_size{static_cast(state.range(0))}; + auto const table_size = static_cast(state.get_int64("table_size")); auto const source_table = create_random_table( - {cudf::type_to_id(), cudf::type_to_id()}, row_count{column_size}); + {cudf::type_to_id(), cudf::type_to_id()}, row_count{table_size}); auto lhs = cudf::column_view(source_table->get_column(0)); auto rhs = cudf::column_view(source_table->get_column(1)); @@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) // Call once for hot cache. 
cudf::binary_operation(lhs, rhs, binop, output_dtype); - for (auto _ : state) { - cuda_event_timer timer(state, true); - cudf::binary_operation(lhs, rhs, binop, output_dtype); - } - // use number of bytes read and written to global memory - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); } +#define BM_STRINGIFY(a) #a + // TODO tparam boolean for null. -#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ - BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ - (::benchmark::State & st) \ - { \ - BM_compiled_binaryop(st, cudf::binary_operator::bop); \ - } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_compiled_binaryop(st, ::cudf::binary_operator::bop); \ + } \ + NVBENCH_BENCH(name) \ + .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \ + .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) #define build_name(a, b, c, d) a##_##b##_##c##_##d diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp deleted file mode 100644 index 2905895a63b..00000000000 --- a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include -#include - -#include - -// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to -// run on most GPUs, but large enough to allow highest throughput -constexpr size_t data_size = 512 << 20; - -void parquet_read_common(cudf::size_type num_rows_to_read, - cudf::size_type num_cols_to_read, - cuio_source_sink_pair& source_sink, - nvbench::state& state) -{ - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - - auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_parquet(read_opts); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); - - auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); - 
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); -} - -template -void bench_batched_memset(nvbench::state& state, nvbench::type_list>) -{ - auto const d_type = get_type_or_group(static_cast(DataType)); - auto const num_cols = static_cast(state.get_int64("num_cols")); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); - auto const compression = cudf::io::compression_type::NONE; - cuio_source_sink_pair source_sink(source_type); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .compression(compression); - cudf::io::write_parquet(write_opts); - auto const num_rows = view.num_rows(); - - parquet_read_common(num_rows, num_cols, source_sink, state); -} - -using d_type_list = nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) - .set_name("batched_memset") - .set_type_axes_names({"data_type"}) - .add_int64_axis("num_cols", {1000}) - .add_string_axis("io_type", {"DEVICE_BUFFER"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 8df1b431095..d7d7fcca044 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,15 +14,17 @@ # This function finds nanoarrow and sets any additional necessary environment variables. 
function(find_and_configure_nanoarrow) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") + # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS - GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git - GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb - GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 41bbf44abc8..33b1b45fb44 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,11 +16,11 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp( - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP} - ) + set(export_args) + if(CUDF_EXPORT_NVCOMP) + set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endif() + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff new file mode 100644 index 00000000000..e9a36fcb567 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff @@ -0,0 +1,38 @@ +diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h +index caa6be4..70ec8a2 100644 +--- a/src/nanoarrow/common/inline_buffer.h ++++ b/src/nanoarrow/common/inline_buffer.h +@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + } + + static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { +- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ++ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? 
first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); +- bits[bytes_begin] &= only_byte_mask; ++ bits[bytes_begin] &= only_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte +- bits[bytes_begin] &= first_byte_mask; ++ bits[bytes_begin] &= first_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { +@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte +- *out_cursor = 0x00; ++ *out_cursor = 0x00; // NOLINT + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json new file mode 100644 index 00000000000..d529787e7c8 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_override.json @@ -0,0 +1,18 @@ + +{ + "packages" : { + "nanoarrow" : { + "version" : "0.6.0.dev", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", + "git_shallow" : false, + "patches" : [ + { + "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", + "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 6d1c91a5752..6902b1948bd 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -8,6 +8,7 @@ This page specifies which regular expression (regex) features are currently supp - cudf::strings::extract() - cudf::strings::extract_all_record() - cudf::strings::findall() +- cudf::strings::find_re() - cudf::strings::replace_re() - cudf::strings::replace_with_backrefs() - cudf::strings::split_re() diff --git 
a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index c3238cb94fd..35a39ef9758 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1425,13 +1425,13 @@ struct pair_rep_accessor { private: template , void>* = nullptr> - __device__ inline auto get_rep(cudf::size_type i) const + __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const { return col.element(i); } template , void>* = nullptr> - __device__ inline auto get_rep(cudf::size_type i) const + __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const { return col.element(i).value(); } diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 3ef7bafe727..48f89b8be25 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -235,7 +235,7 @@ class column_view_base { * * @return Typed pointer to underlying data */ - virtual void const* get_data() const noexcept { return _data; } + [[nodiscard]] virtual void const* get_data() const noexcept { return _data; } data_type _type{type_id::EMPTY}; ///< Element type size_type _size{}; ///< Number of elements @@ -695,7 +695,7 @@ class mutable_column_view : public detail::column_view_base { * * @return Typed pointer to underlying data */ - void const* get_data() const noexcept override; + [[nodiscard]] void const* get_data() const noexcept override; private: friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type); diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 7359a0d5fde..1eaea5b6374 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -38,6 +38,22 @@ namespace datetime { * @file */ +/** + * @brief Types of datetime components that may be extracted. 
+ */ +enum class datetime_component : uint8_t { + YEAR, + MONTH, + DAY, + WEEKDAY, + HOUR, + MINUTE, + SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND +}; + /** * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. @@ -207,6 +223,24 @@ std::unique_ptr extract_nanosecond_fraction( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Extracts the specified datetime component from any datetime type and + * returns an int16_t cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param component The datetime component to extract + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t datetime component + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_datetime_component( + cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group /** * @addtogroup datetime_compute diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index ecf2f610697..de53e7586cd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -18,11 +18,11 @@ #include #include +#include #include #include #include -#include -#include +#include #include #include @@ -30,8 +30,17 @@ #include +#include +#include + namespace cudf { namespace detail { +template +constexpr bool is_product_supported() +{ + return is_numeric(); +} + /** * @brief Maps an `aggregation::Kind` value to it's corresponding binary * operator. 
@@ -113,465 +122,6 @@ constexpr bool has_corresponding_operator() return !std::is_same_v::type, void>; } -template -struct update_target_element { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - CUDF_UNREACHABLE("Invalid source type and aggregation combination."); - } -}; - -template -struct update_target_element< - Source, - aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; - - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { - __device__ 
void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; - - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::SUM, - target_has_nulls, - 
source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; - - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -/** - * @brief Function object to update a single element in a target column using - * the dictionary key addressed by the specific index. - * - * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a - * dictionary. - * - */ -template -struct update_target_from_dictionary { - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - update_target_element{}( - target, target_index, source, source_index); - } - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - } -}; - -/** - * @brief Specialization function for dictionary type and aggregations. - * - * The `source` column is a dictionary type. This functor de-references the - * dictionary's keys child column and maps the input source index through - * the dictionary's indices child column to pass to the `update_target_element` - * in the above `update_target_from_dictionary` using the type-dispatcher to - * resolve the keys column type. 
- * - * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` - * - * @tparam target_has_nulls Indicates presence of null elements in `target` - * @tparam source_has_nulls Indicates presence of null elements in `source`. - */ -template -struct update_target_element< - dictionary32, - k, - target_has_nulls, - source_has_nulls, - std::enable_if_t> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - dispatch_type_and_aggregation( - source.child(cudf::dictionary_column_view::keys_column_index).type(), - k, - update_target_from_dictionary{}, - target, - target_index, - source.child(cudf::dictionary_column_view::keys_column_index), - static_cast(source.element(source_index))); - } -}; - -template -constexpr bool is_product_supported() -{ - return is_numeric(); -} - -template -struct update_target_element()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - auto value = static_cast(source.element(source_index)); - cudf::detail::atomic_add(&target.element(target_index), value * value); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_mul(&target.element(target_index), - static_cast(source.element(source_index))); - if (target_has_nulls and 
target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), Target{1}); - - // It is assumed the output for COUNT_VALID is initialized to be all valid - } -}; - -template -struct update_target_element< - Source, - aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), Target{1}); - - // It is assumed the output for COUNT_ALL is initialized to be all valid - } -}; - -template -struct update_target_element< - Source, - aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), ARGMAX_SENTINEL, source_index); - if (old != ARGMAX_SENTINEL) { - while (source.element(source_index) > source.element(old)) { - old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template 
-struct update_target_element< - Source, - aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), ARGMIN_SENTINEL, source_index); - if (old != ARGMIN_SENTINEL) { - while (source.element(source_index) < source.element(old)) { - old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -/** - * @brief Function object to update a single element in a target column by - * performing an aggregation operation with a single element from a source - * column. - * - * @tparam target_has_nulls Indicates presence of null elements in `target` - * @tparam source_has_nulls Indicates presence of null elements in `source`. - */ -template -struct elementwise_aggregator { - template - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - update_target_element{}( - target, target_index, source, source_index); - } -}; - -/** - * @brief Updates a row in `target` by performing elementwise aggregation - * operations with a row in `source`. 
- * - * For the row in `target` specified by `target_index`, each element at `i` is - * updated by: - * ```c++ - * target_row[i] = aggs[i](target_row[i], source_row[i]) - * ``` - * - * This function only supports aggregations that can be done in a "single pass", - * i.e., given an initial value `R`, the aggregation `op` can be computed on a series - * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order - * of the values of `i`. - * - * The initial value and validity of `R` depends on the aggregation: - * SUM: 0 and NULL - * MIN: Max value of type and NULL - * MAX: Min value of type and NULL - * COUNT_VALID: 0 and VALID - * COUNT_ALL: 0 and VALID - * ARGMAX: `ARGMAX_SENTINEL` and NULL - * ARGMIN: `ARGMIN_SENTINEL` and NULL - * - * It is required that the elements of `target` be initialized with the corresponding - * initial values and validity specified above. - * - * Handling of null elements in both `source` and `target` depends on the aggregation: - * SUM, MIN, MAX, ARGMIN, ARGMAX: - * - `source`: Skipped - * - `target`: Updated from null to valid upon first successful aggregation - * COUNT_VALID, COUNT_ALL: - * - `source`: Skipped - * - `target`: Cannot be null - * - * @param target Table containing the row to update - * @param target_index Index of the row to update in `target` - * @param source Table containing the row used to update the row in `target`. - * The invariant `source.num_columns() >= target.num_columns()` must hold. - * @param source_index Index of the row to use in `source` - * @param aggs Array of aggregations to perform between elements of the `target` - * and `source` rows. Must contain at least `target.num_columns()` valid - * `aggregation::Kind` values. 
- */ -template -__device__ inline void aggregate_row(mutable_table_device_view target, - size_type target_index, - table_device_view source, - size_type source_index, - aggregation::Kind const* aggs) -{ - for (auto i = 0; i < target.num_columns(); ++i) { - dispatch_type_and_aggregation(source.column(i).type(), - aggs[i], - elementwise_aggregator{}, - target.column(i), - target_index, - source.column(i), - source_index); - } -} - /** * @brief Dispatched functor to initialize a column with the identity of an * aggregation operation. diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 4255faea702..6661a461b8b 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -683,7 +683,7 @@ class ewma_aggregation final : public scan_aggregation { { } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -694,7 +694,7 @@ class ewma_aggregation final : public scan_aggregation { return collector.visit(col_type, *this); } - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh new file mode 100644 index 00000000000..10be5e1d36f --- /dev/null +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::detail { +template +struct update_target_element { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element< + Source, + aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MIN, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; + + cudf::detail::atomic_min(&target.element(target_index), + 
static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MAX, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; + + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + 
+template +struct update_target_element< + Source, + aggregation::SUM, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; + + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief Function object to update a single element in a target column using + * the dictionary key addressed by the specific index. + * + * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a + * dictionary. + * + */ +struct update_target_from_dictionary { + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + update_target_element{}(target, target_index, source, source_index); + } + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + } +}; + +/** + * @brief Specialization function for dictionary type and aggregations. + * + * The `source` column is a dictionary type. This functor de-references the + * dictionary's keys child column and maps the input source index through + * the dictionary's indices child column to pass to the `update_target_element` + * in the above `update_target_from_dictionary` using the type-dispatcher to + * resolve the keys column type. 
+ * + * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` + */ +template +struct update_target_element< + dictionary32, + k, + cuda::std::enable_if_t> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + dispatch_type_and_aggregation( + source.child(cudf::dictionary_column_view::keys_column_index).type(), + k, + update_target_from_dictionary{}, + target, + target_index, + source.child(cudf::dictionary_column_view::keys_column_index), + static_cast(source.element(source_index))); + } +}; + +template +struct update_target_element()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target.element(target_index), value * value); + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source.element(source_index))); + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::COUNT_VALID, + cuda::std::enable_if_t()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { 
return; } + + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), Target{1}); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element< + Source, + aggregation::COUNT_ALL, + cuda::std::enable_if_t()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), Target{1}); + + // It is assumed the output for COUNT_ALL is initialized to be all valid + } +}; + +template +struct update_target_element< + Source, + aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), ARGMAX_SENTINEL, source_index); + if (old != ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), ARGMIN_SENTINEL, source_index); + if (old != ARGMIN_SENTINEL) { + while (source.element(source_index) < 
source.element(old)) { + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief Function object to update a single element in a target column by + * performing an aggregation operation with a single element from a source + * column. + */ +struct elementwise_aggregator { + template + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + update_target_element{}(target, target_index, source, source_index); + } +}; + +/** + * @brief Updates a row in `target` by performing elementwise aggregation + * operations with a row in `source`. + * + * For the row in `target` specified by `target_index`, each element at `i` is + * updated by: + * ```c++ + * target_row[i] = aggs[i](target_row[i], source_row[i]) + * ``` + * + * This function only supports aggregations that can be done in a "single pass", + * i.e., given an initial value `R`, the aggregation `op` can be computed on a series + * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order + * of the values of `i`. + * + * The initial value and validity of `R` depends on the aggregation: + * SUM: 0 and NULL + * MIN: Max value of type and NULL + * MAX: Min value of type and NULL + * COUNT_VALID: 0 and VALID + * COUNT_ALL: 0 and VALID + * ARGMAX: `ARGMAX_SENTINEL` and NULL + * ARGMIN: `ARGMIN_SENTINEL` and NULL + * + * It is required that the elements of `target` be initialized with the corresponding + * initial values and validity specified above. 
+ * + * Handling of null elements in both `source` and `target` depends on the aggregation: + * SUM, MIN, MAX, ARGMIN, ARGMAX: + * - `source`: Skipped + * - `target`: Updated from null to valid upon first successful aggregation + * COUNT_VALID, COUNT_ALL: + * - `source`: Skipped + * - `target`: Cannot be null + * + * @param target Table containing the row to update + * @param target_index Index of the row to update in `target` + * @param source Table containing the row used to update the row in `target`. + * The invariant `source.num_columns() >= target.num_columns()` must hold. + * @param source_index Index of the row to use in `source` + * @param aggs Array of aggregations to perform between elements of the `target` + * and `source` rows. Must contain at least `target.num_columns()` valid + * `aggregation::Kind` values. + */ +__device__ inline void aggregate_row(mutable_table_device_view target, + size_type target_index, + table_device_view source, + size_type source_index, + aggregation::Kind const* aggs) +{ + for (auto i = 0; i < target.num_columns(); ++i) { + dispatch_type_and_aggregation(source.column(i).type(), + aggs[i], + elementwise_aggregator{}, + target.column(i), + target_index, + source.column(i), + source_index); + } +} +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 9db7e48498f..df3050d6494 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -115,6 +115,16 @@ std::unique_ptr extract_nanosecond_fraction(cudf::column_view cons rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @copydoc cudf::extract_datetime_component(cudf::column_view const&, datetime_component, + * rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + */ +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + 
/** * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index ce8783d8b79..d7a42d0eca5 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -211,7 +211,6 @@ struct sort_groupby_helper { */ column_view keys_bitmask_column(rmm::cuda_stream_view stream); - private: column_ptr _key_sorted_order; ///< Indices to produce _keys in sorted order column_ptr _unsorted_keys_labels; ///< Group labels for unsorted _keys column_ptr _keys_bitmask_column; ///< Column representing rows with one or more nulls values diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 80a4460023f..4295f5e6ddd 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -143,28 +143,30 @@ std::unique_ptr make_tdigest_column(size_type num_rows, rmm::device_async_resource_ref mr); /** - * @brief Create an empty tdigest column. + * @brief Create a tdigest column of empty tdigests. * - * An empty tdigest column contains a single row of length 0 + * The column created contains the specified number of rows of empty tdigests. * + * @param num_rows The number of rows in the output column. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns An empty tdigest column. + * @returns A tdigest column of empty clusters. */ CUDF_EXPORT -std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::unique_ptr make_empty_tdigests_column(size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** - * @brief Create an empty tdigest scalar. 
+ * @brief Create a scalar of an empty tdigest cluster. * - * An empty tdigest scalar is a struct_scalar that contains a single row of length 0 + * The returned scalar is a struct_scalar that contains a single row of an empty cluster. * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns An empty tdigest scalar. + * @returns A scalar of an empty tdigest cluster. */ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp new file mode 100644 index 00000000000..ed0ab9e6e5b --- /dev/null +++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief A helper function that copies a vector of vectors from source to destination addresses in + * a batched manner. 
+ * + * @tparam SrcIterator **[inferred]** The type of device-accessible source addresses iterator + * @tparam DstIterator **[inferred]** The type of device-accessible destination address iterator + * @tparam SizeIterator **[inferred]** The type of device-accessible buffer size iterator + * + * @param src_iter Device-accessible iterator to source addresses + * @param dst_iter Device-accessible iterator to destination addresses + * @param size_iter Device-accessible iterator to the buffer sizes (in bytes) + * @param num_buffs Number of buffers to be copied + * @param stream CUDA stream to use + */ +template +void batched_memcpy_async(SrcIterator src_iter, + DstIterator dst_iter, + SizeIterator size_iter, + size_t num_buffs, + rmm::cuda_stream_view stream) +{ + size_t temp_storage_bytes = 0; + cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value()); + + rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; + + cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_iter, + dst_iter, + size_iter, + num_buffs, + stream.value()); +} + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp similarity index 98% rename from cpp/include/cudf/io/detail/batched_memset.hpp rename to cpp/include/cudf/detail/utilities/batched_memset.hpp index 1c74be4a9fe..75f738f7529 100644 --- a/cpp/include/cudf/io/detail/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -28,7 +28,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that takes in a vector of device spans and memsets them to the @@ -78,5 +78,5 @@ void batched_memset(std::vector> const& bufs, d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); } -} // namespace io::detail +} // namespace 
detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 632d5a732ec..4f0c52c5954 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -25,33 +26,82 @@ namespace detail { enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; +void cuda_memcpy_async_impl( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); + /** - * @brief Asynchronously copies data between the host and device. + * @brief Asynchronously copies data from host to device memory. * * Implementation may use different strategies depending on the size and type of host data. * - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination device memory + * @param src Source host memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy_async( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = src.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} /** - * @brief Synchronously copies data between the host and device. + * @brief Asynchronously copies data from device to host memory. * * Implementation may use different strategies depending on the size and type of host data. 
* - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination host memory + * @param src Source device memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = dst.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} + +/** + * @brief Synchronously copies data from host to device memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination device memory + * @param src Source host memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} + +/** + * @brief Synchronously copies data from device to host memory. + * + * Implementation may use different strategies depending on the size and type of host data. 
+ * + * @param dst Destination host memory + * @param src Source device memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} } // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index ecb8f910463..3f6ad7b7b1d 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -183,7 +183,7 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } - bool is_device_accessible() const { return _is_device_accessible; } + [[nodiscard]] bool is_device_accessible() const { return _is_device_accessible; } private: rmm::host_async_resource_ref mr; diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp index 8c1c3c28df8..e7643eb44bd 100644 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ b/cpp/include/cudf/detail/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ #include // Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) 
SPDLOG_LOGGER_CRITICAL(&cudf::logger(), __VA_ARGS__) +#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 953ae5b9308..1f1e7a2db77 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -101,12 +101,7 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - auto const is_pinned = source_data.is_device_accessible(); - cuda_memcpy_async(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async(ret, source_data, stream); return ret; } @@ -405,13 +400,8 @@ host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str template host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - auto result = make_host_vector(v.size(), stream); - auto const is_pinned = result.get_allocator().is_device_accessible(); - cuda_memcpy_async(result.data(), - v.data(), - v.size() * sizeof(T), - is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + auto result = make_host_vector(v.size(), stream); + cuda_memcpy_async(result, v, stream); return result; } diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index dc822fee38b..5596f78a90b 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -47,7 +47,7 @@ class dictionary_column_view : private column_view { dictionary_column_view(column_view const& dictionary_column); dictionary_column_view(dictionary_column_view&&) = default; ///< Move constructor dictionary_column_view(dictionary_column_view const&) = default; ///< Copy constructor - ~dictionary_column_view() = default; + ~dictionary_column_view() override = default; /** * @brief Move assignment operator diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 11c778408fe..c9df02f167a 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -36,7 +36,7 @@ namespace CUDF_EXPORT cudf { namespace groupby { namespace detail { namespace sort { -class sort_groupby_helper; +struct sort_groupby_helper; } // namespace sort } // namespace detail diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index b12fbe39a57..dc14802adc1 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -86,14 +86,28 @@ class datasource { /** * @brief Creates a source from a file path. * + * @note Parameters `offset`, `max_size_estimate` and `min_size_estimate` are hints to the + * `datasource` implementation about the expected range of the data that will be read. The + * implementation may use these hints to optimize the read operation. These parameters are usually + * based on the byte range option. 
In this case, `min_size_estimate` should be no greater than the + * byte range to avoid potential issues when reading adjacent ranges. `max_size_estimate` can + * include padding after the byte range, to include additional data that may be needed for + * processing. + * + * @throws cudf::logic_error if the minimum size estimate is greater than the maximum size estimate + * * @param[in] filepath Path to the file to use - * @param[in] offset Bytes from the start of the file (the default is zero) - * @param[in] size Bytes from the offset; use zero for entire file (the default is zero) + * @param[in] offset Starting byte offset from which data will be read (the default is zero) + * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is + * zero, which means the whole file after `offset`) + * @param[in] min_size_estimate Lower estimate of the data range that will be read (the default is + * zero, which means the whole file after `offset`) * @return Constructed datasource object */ static std::unique_ptr create(std::string const& filepath, - size_t offset = 0, - size_t size = 0); + size_t offset = 0, + size_t max_size_estimate = 0, + size_t min_size_estimate = 0); /** * @brief Creates a source from a host memory buffer. 
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 6798557e14e..b662b660557 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -116,9 +116,6 @@ class json_reader_options { // Whether to parse dates as DD/MM versus MM/DD bool _dayfirst = false; - // Whether to use the legacy reader - bool _legacy = false; - // Whether to keep the quote characters of string values bool _keep_quotes = false; diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ee03a382bec..bfe76d5690c 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -1200,7 +1200,7 @@ class parquet_writer_options : public parquet_writer_options_base { * @param sink The sink used for writer output * @param table Table to be written to output */ - explicit parquet_writer_options(sink_info const& sink, table_view const& table); + explicit parquet_writer_options(sink_info const& sink, table_view table); public: /** diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index b117a871b64..d7057cfea7e 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -48,7 +48,7 @@ class lists_column_view : private column_view { lists_column_view(column_view const& lists_column); lists_column_view(lists_column_view&&) = default; ///< Move constructor lists_column_view(lists_column_view const&) = default; ///< Copy constructor - ~lists_column_view() = default; + ~lists_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/reduction/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh index 4cf8564ab3a..5694362af8f 100644 --- a/cpp/include/cudf/reduction/detail/reduction_operators.cuh +++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh @@ -31,17 +31,41 @@ namespace detail { // intermediate data structure to 
compute `var`, `std` template struct var_std { - ResultType value; /// the value - ResultType value_squared; /// the value of squared - - CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0) : value(_value), value_squared(_value_squared){}; + // Uses the pairwise approach of Chan, Golub, and LeVeque, + // _Algorithms for computing the sample variance: analysis and + // recommendations_ (1983) + // https://doi.org/10.1080/00031305.1983.10483115 + // Also http://www.cs.yale.edu/publications/techreports/tr222.pdf + // This is a modification of Youngs and Cramer's online approach. + ResultType running_sum; + ResultType running_square_deviations; + size_type count; + + CUDF_HOST_DEVICE inline var_std(ResultType t = 0, ResultType s = 0, size_type n = 0) + : running_sum(t), running_square_deviations(s), count(n){}; using this_t = var_std; CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { - return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared)); + // Updates as per equations 1.5a and 1.5b in the paper + // T_{1,m+n} = T_{1,m} + T_{m+1,m+n} + // S_{1,m+n} = S_{1,m} + S_{m+1,m+n} + m/(n(m+n)) * (n/m T_{1,m} - T_{m+1,m+n})**2 + // Here the first m samples are in this, the remaining n samples are in rhs. + auto const m = this->count; + auto const n = rhs.count; + // Avoid division by zero. 
+ if (m == 0) { return rhs; } + if (n == 0) { return *this; } + auto const tm = this->running_sum; + auto const tn = rhs.running_sum; + auto const sm = this->running_square_deviations; + auto const sn = rhs.running_square_deviations; + auto const tmn = tm + tn; + auto const diff = ((static_cast(n) / m) * tm) - tn; + // Computing m/n(m+n) as m/n/(m+n) to avoid integer overflow + auto const smn = sm + sn + ((static_cast(m) / n) / (m + n)) * diff * diff; + return {tmn, smn, m + n}; }; }; @@ -50,10 +74,7 @@ template struct transformer_var_std { using OutputType = var_std; - CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) - { - return OutputType(value, value * value); - }; + CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) { return {value, 0, 1}; }; }; // ------------------------------------------------------------------------ @@ -257,12 +278,7 @@ struct variance : public compound_op { cudf::size_type const& count, cudf::size_type const& ddof) { - ResultType mean = input.value / count; - ResultType asum = input.value_squared; - cudf::size_type div = count - ddof; - ResultType var = asum / div - ((mean * mean) * count) / div; - - return var; + return input.running_square_deviations / (count - ddof); }; }; }; diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index e8a498afc09..66be2a12fbe 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -47,6 +47,7 @@ namespace CUDF_EXPORT cudf { */ class scalar { public: + scalar() = delete; virtual ~scalar() = default; scalar& operator=(scalar const& other) = delete; scalar& operator=(scalar&& other) = delete; @@ -96,8 +97,6 @@ class scalar { data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar rmm::device_scalar _is_valid; ///< Device bool signifying validity - scalar() = delete; - /** * @brief Move constructor for scalar. * @param other The other scalar to move from. 
@@ -145,6 +144,7 @@ class fixed_width_scalar : public scalar { public: using value_type = T; ///< Type of the value held by the scalar. + fixed_width_scalar() = delete; ~fixed_width_scalar() override = default; /** @@ -203,8 +203,6 @@ class fixed_width_scalar : public scalar { protected: rmm::device_scalar _data; ///< device memory containing the value - fixed_width_scalar() = delete; - /** * @brief Construct a new fixed width scalar object. * diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 3ebe5cb53e9..f229facca08 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -30,7 +30,7 @@ namespace strings { */ /** - * @brief Returns a boolean column identifying strings entries in which all + * @brief Returns a boolean column identifying string entries where all * characters are of the type specified. * * The output row entry will be set to false if the corresponding string element @@ -105,7 +105,8 @@ std::unique_ptr all_characters_of_type( * `types_to_remove` will be filtered. 
* @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches - * @return New column of boolean results for each string + * @return New strings column with the characters of specified types filtered out and replaced by + * the specified replacement string */ std::unique_ptr filter_characters_of_type( strings_column_view const& input, diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 5d6aff28826..6460d4f43ff 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -74,9 +74,9 @@ character_cases_table_type const* get_character_cases_table(); */ struct special_case_mapping { uint16_t num_upper_chars; - uint16_t upper[3]; + uint16_t upper[3]; // NOLINT uint16_t num_lower_chars; - uint16_t lower[3]; + uint16_t lower[3]; // NOLINT }; /** diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index c6b9bc7e58a..867764b6d9a 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -66,6 +66,35 @@ std::unique_ptr findall( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the starting character index of the first match for the given pattern + * in each row of the input column + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * p = regex_program::create("[be]") + * r = find_re(s, p) + * r is now [0, 2, 3, -1] + * @endcode + * + * A null output row occurs if the corresponding input row is null. + * A -1 is returned for rows that do not contain a match. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of integers + */ +std::unique_ptr find_re( + strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of doxygen group } // namespace strings } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index 9da859d9c87..1bf1c26f471 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -54,6 +54,8 @@ struct regex_program { regex_flags flags = regex_flags::DEFAULT, capture_groups capture = capture_groups::EXTRACT); + regex_program() = delete; + /** * @brief Move constructor * @@ -115,8 +117,6 @@ struct regex_program { ~regex_program(); private: - regex_program() = delete; - std::string _pattern; regex_flags _flags; capture_groups _capture; diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 14695c3bb27..34ed3c5618e 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -99,7 +99,7 @@ __device__ inline std::pair bytes_to_character_position(st * values. Also, this char pointer serves as valid device pointer of identity * value for minimum operator on string values. 
*/ -static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; +static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; // NOLINT } // namespace detail } // namespace strings @@ -283,14 +283,11 @@ __device__ inline size_type string_view::const_iterator::position() const { retu __device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; } -__device__ inline string_view::const_iterator string_view::begin() const -{ - return const_iterator(*this, 0, 0); -} +__device__ inline string_view::const_iterator string_view::begin() const { return {*this, 0, 0}; } __device__ inline string_view::const_iterator string_view::end() const { - return const_iterator(*this, length(), size_bytes()); + return {*this, length(), size_bytes()}; } // @endcond @@ -411,7 +408,7 @@ __device__ inline size_type string_view::find(char const* str, __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const { - char str[sizeof(char_utf8)]; + char str[sizeof(char_utf8)]; // NOLINT size_type chwidth = strings::detail::from_char_utf8(chr, str); return find(str, chwidth, pos, count); } @@ -433,7 +430,7 @@ __device__ inline size_type string_view::rfind(char const* str, __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const { - char str[sizeof(char_utf8)]; + char str[sizeof(char_utf8)]; // NOLINT size_type chwidth = strings::detail::from_char_utf8(chr, str); return rfind(str, chwidth, pos, count); } diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 4a2512eb7c5..6ec8d1238d6 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -45,7 +45,7 @@ class strings_column_view : private column_view { strings_column_view(column_view strings_column); strings_column_view(strings_column_view&&) = default; ///< Move constructor 
strings_column_view(strings_column_view const&) = default; ///< Copy constructor - ~strings_column_view() = default; + ~strings_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp index 19798f51656..91d7ddce955 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -42,7 +42,7 @@ class structs_column_view : public column_view { // Foundation members: structs_column_view(structs_column_view const&) = default; ///< Copy constructor structs_column_view(structs_column_view&&) = default; ///< Move constructor - ~structs_column_view() = default; + ~structs_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 762131a174f..15fdad21d9f 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -148,7 +148,7 @@ class table { std::vector columns(std::distance(begin, end)); std::transform( begin, end, columns.begin(), [this](auto index) { return _columns.at(index)->view(); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 4a990f67ce4..d41176590ea 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -241,7 +241,7 @@ class table_view : public detail::table_view_base { { std::vector columns(std::distance(begin, end)); std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.hpp b/cpp/include/cudf/tdigest/tdigest_column_view.hpp index 2f19efa5630..da4954b859c 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.hpp +++ 
b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -59,7 +59,7 @@ class tdigest_column_view : private column_view { tdigest_column_view(column_view const&); ///< Construct tdigest_column_view from a column_view tdigest_column_view(tdigest_column_view&&) = default; ///< Move constructor tdigest_column_view(tdigest_column_view const&) = default; ///< Copy constructor - ~tdigest_column_view() = default; + ~tdigest_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index 45d5d1b12e1..982554a23f5 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -22,6 +22,10 @@ namespace CUDF_EXPORT cudf { +namespace detail { +spdlog::logger& logger(); +} + /** * @brief Returns the global logger. * @@ -43,6 +47,8 @@ namespace CUDF_EXPORT cudf { * * @return spdlog::logger& The logger. */ -spdlog::logger& logger(); +[[deprecated( + "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& +logger(); } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 0daebc0dd8d..914731ea417 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -236,26 +236,26 @@ struct host_span : public cudf::detail::span_base::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr host_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } /// Constructor from const container /// @param in The container to construct the span from - template < - typename C, - // Only supported containers of types convertible to T - std::enable_if_t::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + 
std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr host_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } @@ -264,7 +264,7 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + std::enable_if_t>* = nullptr> // NOLINT constexpr host_span(cudf::detail::host_vector& in) : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} { @@ -274,7 +274,7 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + std::enable_if_t>* = nullptr> // NOLINT constexpr host_span(cudf::detail::host_vector const& in) : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} { @@ -285,7 +285,7 @@ struct host_span : public cudf::detail::span_base, + std::is_convertible_v, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept : base(other.data(), other.size()) @@ -333,26 +333,26 @@ struct device_span : public cudf::detail::span_base::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr device_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } /// Constructor from const container /// @param in The container to construct the span from - template < - typename C, - // Only supported containers of types convertible to T - std::enable_if_t::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr device_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } @@ -362,7 +362,7 @@ struct device_span : public cudf::detail::span_base, + std::is_convertible_v, // NOLINT void>* = nullptr> constexpr device_span(device_span const& other) noexcept : 
base(other.data(), other.size()) diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index 1758790cd64..c259d61060b 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. - auto expected = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -562,12 +562,12 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto b = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto c = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto b = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::tdigest::detail::make_empty_tdigest_column( - 
cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/include/cudf_test/type_list_utilities.hpp b/cpp/include/cudf_test/type_list_utilities.hpp index 1793a8ecce0..3c96c59f0b7 100644 --- a/cpp/include/cudf_test/type_list_utilities.hpp +++ b/cpp/include/cudf_test/type_list_utilities.hpp @@ -414,9 +414,8 @@ struct RemoveIfImpl> { template struct RemoveIfImpl> { - using type = - Concat::value, Types<>, Types>::type, - typename RemoveIfImpl>::type>; + using type = Concat::value, Types<>, Types>, + typename RemoveIfImpl>::type>; }; // @endcond diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index 723ba310a1e..dca590baebf 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext { * @param targets Strings to compute edit distance against `input` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New lists column of edit distance values */ std::unique_ptr edit_distance( cudf::strings_column_view const& input, diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index 02998b84ffd..d915c85bf85 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,9 +15,13 @@ */ #include +#include +#include #include +#include + namespace cudf { namespace detail { void initialize_with_identity(mutable_table_view& table, diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ddb0dbcd96d..a497cedb3bc 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -44,19 +44,6 @@ namespace cudf { namespace datetime { namespace detail { -enum class datetime_component { - INVALID = 0, - YEAR, - MONTH, - DAY, - WEEKDAY, - HOUR, - MINUTE, - SECOND, - MILLISECOND, - MICROSECOND, - NANOSECOND -}; enum class rounding_function { CEIL, ///< Rounds up to the next integer multiple of the provided frequency @@ -453,90 +440,70 @@ std::unique_ptr extract_year(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::YEAR, stream, mr); } std::unique_ptr extract_month(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MONTH, stream, mr); } std::unique_ptr extract_day(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::DAY, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return 
detail::extract_datetime_component(column, datetime_component::WEEKDAY, stream, mr); } std::unique_ptr extract_hour(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::HOUR, stream, mr); } std::unique_ptr extract_minute(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MINUTE, stream, mr); } std::unique_ptr extract_second(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::SECOND, stream, mr); } std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MILLISECOND, stream, mr); } std::unique_ptr extract_microsecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MICROSECOND, stream, mr); } std::unique_ptr extract_nanosecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - 
detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::NANOSECOND, stream, mr); } std::unique_ptr last_day_of_month(column_view const& column, @@ -576,6 +543,32 @@ std::unique_ptr extract_quarter(column_view const& column, return apply_datetime_op(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ +#define extract(field) \ + case field: \ + return apply_datetime_op, cudf::type_id::INT16>( \ + column, stream, mr) + + switch (component) { + extract(datetime_component::YEAR); + extract(datetime_component::MONTH); + extract(datetime_component::DAY); + extract(datetime_component::WEEKDAY); + extract(datetime_component::HOUR); + extract(datetime_component::MINUTE); + extract(datetime_component::SECOND); + extract(datetime_component::MILLISECOND); + extract(datetime_component::MICROSECOND); + extract(datetime_component::NANOSECOND); + default: CUDF_FAIL("Unsupported datetime component."); + } +#undef extract +} + } // namespace detail std::unique_ptr ceil_datetimes(column_view const& column, @@ -661,6 +654,15 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_datetime_component(column, component, stream, mr); +} + std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index cf239297255..a6b6cbbf0b5 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -38,7 
+38,7 @@ std::string const tzif_system_directory = "/usr/share/zoneinfo/"; struct timezone_file_header { uint32_t magic; ///< "TZif" uint8_t version; ///< 0:version1, '2':version2, '3':version3 - uint8_t reserved15[15]; ///< unused, reserved for future use + uint8_t reserved15[15]; ///< unused, reserved for future use // NOLINT uint32_t isutccnt; ///< number of UTC/local indicators contained in the body uint32_t isstdcnt; ///< number of standard/wall indicators contained in the body uint32_t leapcnt; ///< number of leap second records contained in the body diff --git a/cpp/src/dictionary/dictionary_column_view.cpp b/cpp/src/dictionary/dictionary_column_view.cpp index 4906e5b4f9c..3e4a201bba4 100644 --- a/cpp/src/dictionary/dictionary_column_view.cpp +++ b/cpp/src/dictionary/dictionary_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,8 +36,7 @@ column_view dictionary_column_view::indices() const noexcept { return child(0); column_view dictionary_column_view::get_indices_annotated() const noexcept { - return column_view( - indices().type(), size(), indices().head(), null_mask(), null_count(), offset()); + return {indices().type(), size(), indices().head(), null_mask(), null_count(), offset()}; } column_view dictionary_column_view::keys() const noexcept { return child(1); } diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index 9abfe22950a..188d0cff3f1 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -18,8 +18,8 @@ #include "multi_pass_kernels.cuh" -#include #include +#include #include #include @@ -100,7 +100,7 @@ struct compute_single_pass_aggs_fn { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { auto const result = set.insert_and_find(i); - cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } } }; diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 2358f47bbbb..f9adfc6060e 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index ba5b11b90d8..a1be6aade4e 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -118,8 +118,8 @@ DLDataType data_type_to_DLDataType(data_type type) // Context object to own memory allocated for DLManagedTensor struct dltensor_context { - int64_t shape[2]; - int64_t strides[2]; + int64_t shape[2]; // NOLINT + int64_t strides[2]; // NOLINT rmm::device_buffer buffer; static void 
deleter(DLManagedTensor* arg) diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 2041f03cd81..d5caa4720ac 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -16,6 +16,7 @@ #include "avro.hpp" +#include #include #include @@ -199,7 +200,7 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) // Read the next sync markers and ensure they match the first ones we // encountered. If they don't, we have to assume the data is corrupted, // and thus, we terminate processing immediately. - uint64_t const sync_marker[] = {get_raw(), get_raw()}; + std::array const sync_marker = {get_raw(), get_raw()}; bool valid_sync_markers = ((sync_marker[0] == md->sync_marker[0]) && (sync_marker[1] == md->sync_marker[1])); if (!valid_sync_markers) { return false; } @@ -302,7 +303,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& // Empty schema if (json_str == "[]") return true; - char depthbuf[MAX_SCHEMA_DEPTH]; + std::array depthbuf; int depth = 0, parent_idx = -1, entry_idx = -1; json_state_e state = state_attrname; std::string str; diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp index f2813a1ba51..2e992546ccc 100644 --- a/cpp/src/io/avro/avro.hpp +++ b/cpp/src/io/avro/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include "avro_common.hpp" #include +#include #include #include #include @@ -100,15 +101,15 @@ struct column_desc { */ struct file_metadata { std::map user_data; - std::string codec = ""; - uint64_t sync_marker[2] = {0, 0}; - size_t metadata_size = 0; - size_t total_data_size = 0; - size_t selected_data_size = 0; - size_type num_rows = 0; - size_type skip_rows = 0; - size_type total_num_rows = 0; - uint32_t max_block_size = 0; + std::string codec = ""; + std::array sync_marker = {0, 0}; + size_t metadata_size = 0; + size_t total_data_size = 0; + size_t selected_data_size = 0; + size_type num_rows = 0; + size_type skip_rows = 0; + size_type total_num_rows = 0; + uint32_t max_block_size = 0; std::vector schema; std::vector block_list; std::vector columns; diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 602ff1734b6..1af45b41d8e 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -42,7 +42,7 @@ struct gz_file_header_s { uint8_t id2; // 0x8b uint8_t comp_mthd; // compression method (0-7=reserved, 8=deflate) uint8_t flags; // flags (GZIPHeaderFlag) - uint8_t mtime[4]; // If non-zero: modification time (Unix format) + uint8_t mtime[4]; // If non-zero: modification time (Unix format) // NOLINT uint8_t xflags; // Extra compressor-specific flags uint8_t os; // OS id }; @@ -103,7 +103,7 @@ struct zip_lfh_s { }; struct bz2_file_header_s { - uint8_t sig[3]; // "BZh" + uint8_t sig[3]; // "BZh" // NOLINT uint8_t blksz; // block size 1..9 in 100kB units (post-RLE) }; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index ebca334a715..8c32fc85f78 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -46,11 +46,8 @@ #include #include -#include #include -#include #include -#include #include #include #include @@ -88,7 +85,7 @@ class selected_rows_offsets { : all{std::move(data)}, selected{selected_span} { } - selected_rows_offsets(rmm::cuda_stream_view stream) : 
all{0, stream}, selected{all} {} + explicit selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} operator device_span() const { return selected; } void shrink(size_t size) @@ -196,15 +193,11 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) container.resize(1, stream); } -size_t find_first_row_start(char row_terminator, host_span data) +constexpr std::array UTF8_BOM = {0xEF, 0xBB, 0xBF}; +[[nodiscard]] bool has_utf8_bom(host_span data) { - // For now, look for the first terminator (assume the first terminator isn't within a quote) - // TODO: Attempt to infer this from the data - size_t pos = 0; - while (pos < data.size() && data[pos] != row_terminator) { - ++pos; - } - return std::min(pos + 1, data.size()); + return data.size() >= UTF8_BOM.size() && + memcmp(data.data(), UTF8_BOM.data(), UTF8_BOM.size()) == 0; } /** @@ -213,20 +206,28 @@ size_t find_first_row_start(char row_terminator, host_span data) * This function scans the input data to record the row offsets (relative to the start of the * input data). A row is actually the data/offset between two termination symbols. 
* - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory + * @param[in] source The source data (may be compressed) + * @param[in] reader_opts Settings for controlling reading behavior + * @param[in] parse_opts Settings for controlling parsing behavior + * @param[out] header The header row, if any + * @param[in] data Host buffer containing uncompressed data, if input is compressed + * @param[in] byte_range_offset Offset of the byte range + * @param[in] range_begin Start of the first row, relative to the byte range start + * @param[in] range_end End of the data to read, relative to the byte range start; equal to the + * data size if all data after byte_range_offset needs to be read + * @param[in] skip_rows Number of rows to skip from the start + * @param[in] num_rows Number of rows to read; -1 means all + * @param[in] load_whole_file Indicates if the whole file should be read + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @return Input data and row offsets in the device memory */ std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( + cudf::io::datasource* source, csv_reader_options const& reader_opts, parse_options const& parse_opts, std::vector& header, - host_span data, + std::optional> data, + size_t byte_range_offset, size_t range_begin, size_t range_end, size_t skip_rows, @@ -235,50 +236,81 @@ std::pair, selected_rows_offsets> load_data_and_gather rmm::cuda_stream_view stream) { constexpr size_t max_chunk_bytes = 64 * 1024 * 
1024; // 64MB - size_t buffer_size = std::min(max_chunk_bytes, data.size()); - size_t max_blocks = - std::max((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2); - cudf::detail::hostdevice_vector row_ctx(max_blocks, stream); - size_t buffer_pos = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); - size_t pos = std::min(range_begin, data.size()); - size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; - uint64_t ctx = 0; + + auto const data_size = data.has_value() ? data->size() : source->size(); + auto const buffer_size = std::min(max_chunk_bytes, data_size); + auto const max_input_size = [&] { + if (range_end == data_size) { + return data_size - byte_range_offset; + } else { + return std::min(reader_opts.get_byte_range_size_with_padding(), + data_size - byte_range_offset); + } + }(); + auto const header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; // For compatibility with the previous parser, a row is considered in-range if the // previous row terminator is within the given range - range_end += (range_end < data.size()); + range_end += (range_end < data_size); - // Reserve memory by allocating and then resetting the size - rmm::device_uvector d_data{ - (load_whole_file) ? data.size() : std::min(buffer_size * 2, data.size()), stream}; - d_data.resize(0, stream); + auto pos = range_begin; + // When using byte range, need the line terminator of last line before the range + auto input_pos = byte_range_offset == 0 ? pos : pos - 1; + uint64_t ctx = 0; + + rmm::device_uvector d_data{0, stream}; + d_data.reserve((load_whole_file) ? 
data_size : std::min(buffer_size * 2, max_input_size), stream); rmm::device_uvector all_row_offsets{0, stream}; + + auto const max_blocks = + std::max((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2); + cudf::detail::hostdevice_vector row_ctx(max_blocks, stream); do { - size_t target_pos = std::min(pos + max_chunk_bytes, data.size()); - size_t chunk_size = target_pos - pos; + auto const target_pos = std::min(pos + max_chunk_bytes, max_input_size); + auto const chunk_size = target_pos - pos; auto const previous_data_size = d_data.size(); - d_data.resize(target_pos - buffer_pos, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size, - data.begin() + buffer_pos + previous_data_size, - target_pos - buffer_pos - previous_data_size, - cudaMemcpyDefault, - stream.value())); + d_data.resize(target_pos - input_pos, stream); + + auto const read_offset = byte_range_offset + input_pos + previous_data_size; + auto const read_size = target_pos - input_pos - previous_data_size; + if (data.has_value()) { + CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, + data->data() + read_offset, + target_pos - input_pos - previous_data_size, + cudaMemcpyDefault, + stream.value())); + } else { + if (source->is_device_read_preferred(read_size)) { + source->device_read(read_offset, + read_size, + reinterpret_cast(d_data.data() + previous_data_size), + stream); + } else { + auto const buffer = source->host_read(read_offset, read_size); + CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, + buffer->data(), + buffer->size(), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); // To prevent buffer going out of scope before we copy the data. + } + } // Pass 1: Count the potential number of rows in each character block for each // possible parser state at the beginning of the block. 
- uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), - row_ctx.device_ptr(), - device_span(), - d_data, - chunk_size, - pos, - buffer_pos, - data.size(), - range_begin, - range_end, - skip_rows, - stream); + auto const num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), + row_ctx.device_ptr(), + device_span(), + d_data, + chunk_size, + pos, + input_pos, + max_input_size, + range_begin, + range_end, + skip_rows, + stream); CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), row_ctx.device_ptr(), num_blocks * sizeof(uint64_t), @@ -312,14 +344,14 @@ std::pair, selected_rows_offsets> load_data_and_gather d_data, chunk_size, pos, - buffer_pos, - data.size(), + input_pos, + max_input_size, range_begin, range_end, skip_rows, stream); // With byte range, we want to keep only one row out of the specified range - if (range_end < data.size()) { + if (range_end < data_size) { CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), row_ctx.device_ptr(), num_blocks * sizeof(uint64_t), @@ -356,18 +388,18 @@ std::pair, selected_rows_offsets> load_data_and_gather size_t discard_bytes = std::max(d_data.size(), sizeof(char)) - sizeof(char); if (discard_bytes != 0) { erase_except_last(d_data, stream); - buffer_pos += discard_bytes; + input_pos += discard_bytes; } } pos = target_pos; - } while (pos < data.size()); + } while (pos < max_input_size); auto const non_blank_row_offsets = io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream); auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets}; // Remove header rows and extract header - size_t const header_row_index = std::max(header_rows, 1) - 1; + auto const header_row_index = std::max(header_rows, 1) - 1; if (header_row_index + 1 < row_offsets.size()) { CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), row_offsets.data() + header_row_index, @@ -376,11 +408,20 @@ std::pair, selected_rows_offsets> load_data_and_gather 
stream.value())); stream.synchronize(); - auto const header_start = buffer_pos + row_ctx[0]; - auto const header_end = buffer_pos + row_ctx[1]; - CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), + auto const header_start = input_pos + row_ctx[0]; + auto const header_end = input_pos + row_ctx[1]; + CUDF_EXPECTS(header_start <= header_end && header_end <= max_input_size, "Invalid csv header location"); - header.assign(data.begin() + header_start, data.begin() + header_end); + header.resize(header_end - header_start); + if (data.has_value()) { + std::copy(data->begin() + byte_range_offset + header_start, + data->begin() + byte_range_offset + header_end, + header.begin()); + } else { + source->host_read(header_start + byte_range_offset, + header_end - header_start, + reinterpret_cast(header.data())); + } if (header_rows > 0) { row_offsets.erase_first_n(header_rows); } } // Apply num_rows limit @@ -397,73 +438,89 @@ std::pair, selected_rows_offsets> select_data_and_row_ parse_options const& parse_opts, rmm::cuda_stream_view stream) { - auto range_offset = reader_opts.get_byte_range_offset(); - auto range_size = reader_opts.get_byte_range_size(); - auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - auto skip_rows = reader_opts.get_skiprows(); - auto skip_end_rows = reader_opts.get_skipfooter(); - auto num_rows = reader_opts.get_nrows(); + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto skip_rows = reader_opts.get_skiprows(); + auto skip_end_rows = reader_opts.get_skipfooter(); + auto num_rows = reader_opts.get_nrows(); if (range_offset > 0 || range_size > 0) { CUDF_EXPECTS(reader_opts.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } + // TODO: Allow parsing the header outside the mapped range + CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), + "byte_range offset with header not 
supported"); - // Transfer source data to GPU - if (!source->is_empty()) { - auto buffer = - source->host_read(range_offset, range_size_padded != 0 ? range_size_padded : source->size()); - auto h_data = - host_span(reinterpret_cast(buffer->data()), buffer->size()); - - std::vector h_uncomp_data_owner; - if (reader_opts.get_compression() != compression_type::NONE) { - h_uncomp_data_owner = - decompress(reader_opts.get_compression(), {buffer->data(), buffer->size()}); - h_data = {reinterpret_cast(h_uncomp_data_owner.data()), - h_uncomp_data_owner.size()}; - buffer.reset(); - } + if (source->is_empty()) { + return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; + } - // check for and skip UTF-8 BOM - uint8_t const UTF8_BOM[] = {0xEF, 0xBB, 0xBF}; - if (h_data.size() >= sizeof(UTF8_BOM) && - memcmp(h_data.data(), UTF8_BOM, sizeof(UTF8_BOM)) == 0) { - h_data = h_data.subspan(sizeof(UTF8_BOM), h_data.size() - sizeof(UTF8_BOM)); - } + std::optional> h_data; + std::vector h_uncomp_data_owner; + if (reader_opts.get_compression() != compression_type::NONE) { + auto const h_comp_data = source->host_read(0, source->size()); + h_uncomp_data_owner = + decompress(reader_opts.get_compression(), {h_comp_data->data(), h_comp_data->size()}); + h_data = host_span{reinterpret_cast(h_uncomp_data_owner.data()), + h_uncomp_data_owner.size()}; + } - // None of the parameters for row selection is used, we are parsing the entire file - bool const load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && - skip_end_rows <= 0 && num_rows == -1; - - // With byte range, find the start of the first data row - size_t const data_start_offset = - (range_offset != 0) ? 
find_first_row_start(parse_opts.terminator, h_data) : 0; - - // TODO: Allow parsing the header outside the mapped range - CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), - "byte_range offset with header not supported"); - - // Gather row offsets - auto data_row_offsets = - load_data_and_gather_row_offsets(reader_opts, - parse_opts, - header, - h_data, - data_start_offset, - (range_size) ? range_size : h_data.size(), - (skip_rows > 0) ? skip_rows : 0, - num_rows, - load_whole_file, - stream); - auto& row_offsets = data_row_offsets.second; - // Exclude the rows that are to be skipped from the end - if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { - row_offsets.shrink(row_offsets.size() - skip_end_rows); + size_t data_start_offset = range_offset; + if (h_data.has_value()) { + if (has_utf8_bom(*h_data)) { data_start_offset += sizeof(UTF8_BOM); } + } else { + if (range_offset == 0) { + auto bom_buffer = source->host_read(0, std::min(source->size(), sizeof(UTF8_BOM))); + auto bom_chars = host_span{reinterpret_cast(bom_buffer->data()), + bom_buffer->size()}; + if (has_utf8_bom(bom_chars)) { data_start_offset += sizeof(UTF8_BOM); } + } else { + auto find_data_start_chunk_size = 1024ul; + while (data_start_offset < source->size()) { + auto const read_size = + std::min(find_data_start_chunk_size, source->size() - data_start_offset); + auto buffer = source->host_read(data_start_offset, read_size); + auto buffer_chars = + host_span{reinterpret_cast(buffer->data()), buffer->size()}; + + if (auto first_row_start = + std::find(buffer_chars.begin(), buffer_chars.end(), parse_opts.terminator); + first_row_start != buffer_chars.end()) { + data_start_offset += std::distance(buffer_chars.begin(), first_row_start) + 1; + break; + } + data_start_offset += read_size; + find_data_start_chunk_size *= 2; + } } - return data_row_offsets; } - return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; + + // None of the parameters for 
row selection is used, we are parsing the entire file + bool const load_whole_file = + range_offset == 0 && range_size == 0 && skip_rows <= 0 && skip_end_rows <= 0 && num_rows == -1; + + // Transfer source data to GPU and gather row offsets + auto const uncomp_size = h_data.has_value() ? h_data->size() : source->size(); + auto data_row_offsets = load_data_and_gather_row_offsets(source, + reader_opts, + parse_opts, + header, + h_data, + range_offset, + data_start_offset - range_offset, + (range_size) ? range_size : uncomp_size, + (skip_rows > 0) ? skip_rows : 0, + num_rows, + load_whole_file, + stream); + auto& row_offsets = data_row_offsets.second; + // Exclude the rows that are to be skipped from the end + if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { + row_offsets.shrink(row_offsets.size() - skip_end_rows); + } + + return data_row_offsets; } void select_data_types(host_span user_dtypes, diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 0ca54da5aaf..5a060902eb2 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -38,6 +38,7 @@ #include #include +#include namespace cudf::io { @@ -121,14 +122,16 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( namespace { std::vector> make_datasources(source_info const& info, - size_t range_offset = 0, - size_t range_size = 0) + size_t offset = 0, + size_t max_size_estimate = 0, + size_t min_size_estimate = 0) { switch (info.type()) { case io_type::FILEPATH: { auto sources = std::vector>(); for (auto const& filepath : info.filepaths()) { - sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + sources.emplace_back( + cudf::io::datasource::create(filepath, offset, max_size_estimate, min_size_estimate)); } return sources; } @@ -210,7 +213,8 @@ table_with_metadata read_json(json_reader_options options, auto datasources = make_datasources(options.get_source(), options.get_byte_range_offset(), - 
options.get_byte_range_size_with_padding()); + options.get_byte_range_size_with_padding(), + options.get_byte_range_size()); return json::detail::read_json(datasources, options, stream, mr); } @@ -237,7 +241,8 @@ table_with_metadata read_csv(csv_reader_options options, auto datasources = make_datasources(options.get_source(), options.get_byte_range_offset(), - options.get_byte_range_size_with_padding()); + options.get_byte_range_size_with_padding(), + options.get_byte_range_size()); CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); @@ -852,8 +857,8 @@ void parquet_writer_options_base::set_sorting_columns(std::vector, hashmap_of_device_columns> build_tree is_mixed_type_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async( + d_column_tree.node_categories, column_categories, stream); } // ignore all children of columns forced as string @@ -653,11 +650,7 @@ std::pair, hashmap_of_device_columns> build_tree forced_as_string_column[this_col_id]) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, column_categories, stream); // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 1c15e147b13..76816071d8c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -618,12 +618,14 @@ struct PdaSymbolToSymbolGroupId { 
constexpr auto pda_sgid_lookup_size = static_cast(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0])); // We map the delimiter character to LINE_BREAK symbol group id, and the newline character - // to OTHER. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, + // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, // escape, comma, colon or whitespace characters. + auto constexpr newline = '\n'; + auto constexpr whitespace = ' '; auto const symbol_position = symbol == delimiter - ? static_cast('\n') - : (symbol == '\n' ? static_cast(delimiter) : static_cast(symbol)); + ? static_cast(newline) + : (symbol == newline ? static_cast(whitespace) : static_cast(symbol)); PdaSymbolGroupIdT symbol_gid = tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)]; return stack_idx * static_cast(symbol_group_id::NUM_PDA_INPUT_SGS) + diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 83c7b663980..d41d137a2c9 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -87,13 +88,25 @@ void validate_token_stream(device_span d_input, { CUDF_FUNC_RANGE(); if (!options.is_strict_validation()) { return; } + + rmm::device_uvector d_invalid = cudf::detail::make_zeroed_device_uvector_async( + tokens.size(), stream, cudf::get_current_device_resource_ref()); + using token_t = cudf::io::json::token_t; - cudf::detail::optional_trie trie_na = - cudf::detail::create_serialized_trie(options.get_na_values(), stream); - auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto literals = options.get_na_values(); + literals.emplace_back("null"); // added these too to single trie + literals.emplace_back("true"); + literals.emplace_back("false"); + + cudf::detail::optional_trie trie_literals = + cudf::detail::create_serialized_trie(literals, stream); + 
cudf::detail::optional_trie trie_nonnumeric = cudf::detail::create_serialized_trie( + {"NaN", "Infinity", "+INF", "+Infinity", "-INF", "-Infinity"}, stream); + auto validate_values = cuda::proclaim_return_type( [data = d_input.data(), - trie_na = trie_na_view, + trie_literals = cudf::detail::make_trie_view(trie_literals), + trie_nonnumeric = cudf::detail::make_trie_view(trie_nonnumeric), allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), allow_nonnumeric = options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, @@ -101,24 +114,15 @@ void validate_token_stream(device_span d_input, // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - auto c = data[start]; - auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); - if (is_null_literal) { - return true; - } else if ('n' == c) { - return substr_eq(data, start, end, 4, "null"); - } else if ('t' == c) { - return substr_eq(data, start, end, 4, "true"); - } else if ('f' == c) { - return substr_eq(data, start, end, 5, "false"); - } else if (allow_nonnumeric && c == 'N') { - return substr_eq(data, start, end, 3, "NaN"); - } else if (allow_nonnumeric && c == 'I') { - return substr_eq(data, start, end, 8, "Infinity"); - } else if (allow_nonnumeric && c == '+') { - return substr_eq(data, start, end, 4, "+INF") || - substr_eq(data, start, end, 9, "+Infinity"); - } else if ('-' == c || c <= '9' && 'c' >= '0') { + auto const is_literal = serialized_trie_contains(trie_literals, {data + start, end - start}); + if (is_literal) { return true; } + if (allow_nonnumeric) { + auto const is_nonnumeric = + serialized_trie_contains(trie_nonnumeric, {data + start, end - start}); + if (is_nonnumeric) { return true; } + } + auto c = data[start]; + if ('-' == c || c <= '9' && 'c' >= '0') { // number auto num_state = number_state::START; for 
(auto at = start; at < end; at++) { @@ -140,9 +144,6 @@ void validate_token_stream(device_span d_input, num_state = number_state::LEADING_ZERO; } else if (c >= '1' && c <= '9') { num_state = number_state::WHOLE; - } else if (allow_nonnumeric && 'I' == c) { - return substr_eq(data, start, end, 4, "-INF") || - substr_eq(data, start, end, 9, "-Infinity"); } else { return false; } @@ -273,33 +274,44 @@ void validate_token_stream(device_span d_input, auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); - auto predicate = [tokens = tokens.begin(), - token_indices = token_indices.begin(), - validate_values, - validate_strings] __device__(auto i) -> bool { + auto predicate = cuda::proclaim_return_type([tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_values, + validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { return !validate_values(token_indices[i - 1], token_indices[i]); } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; - }; + }); + + auto conditional_invalidout_it = + cudf::detail::make_tabulate_output_iterator(cuda::proclaim_return_type( + [d_invalid = d_invalid.begin()] __device__(size_type i, bool x) -> void { + if (x) { d_invalid[i] = true; } + })); + thrust::transform(rmm::exec_policy_nosync(stream), + count_it, + count_it + num_tokens, + conditional_invalidout_it, + predicate); using scan_type = write_if::scan_type; auto conditional_write = write_if{tokens.begin(), num_tokens}; auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); - auto transform_op = cuda::proclaim_return_type( - [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { - if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; - return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; - }); - auto binary_op = 
cuda::proclaim_return_type( + auto binary_op = cuda::proclaim_return_type( [] __device__(scan_type prev, scan_type curr) -> scan_type { auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); - return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); + return {(curr.second ? curr.first : op_result), prev.second | curr.second}; + }); + auto transform_op = cuda::proclaim_return_type( + [d_invalid = d_invalid.begin(), tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (d_invalid[i]) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; }); - thrust::transform_inclusive_scan(rmm::exec_policy(stream), + thrust::transform_inclusive_scan(rmm::exec_policy_nosync(stream), count_it, count_it + num_tokens, conditional_output_it, diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index dc7199d7ab1..e1241f8f90c 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -170,6 +170,9 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + if (column_v.is_empty()) { // empty begets empty + return make_empty_column(type_id::STRING); + } auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 790532c9d54..5ab36fdae8e 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -258,7 +258,7 @@ class ProtobufReader { private: template - friend class FunctionSwitchImpl; + friend struct FunctionSwitchImpl; void skip_bytes(size_t bytecnt) { diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5c70e35fd2e..ed0b6969154 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -20,6 +20,8 @@ #include "orc_gpu.hpp" #include +#include +#include #include #include #include @@ -1087,37 +1089,42 
@@ CUDF_KERNEL void __launch_bounds__(block_size) /** * @brief Merge chunked column data into a single contiguous stream * - * @param[in,out] strm_desc StripeStream device array [stripe][stream] - * @param[in,out] streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc StripeStream device array [stripe][stream] + * @param[in] streams List of encoder chunk streams [column][rowgroup] + * @param[out] srcs List of source encoder chunk stream data addresses + * @param[out] dsts List of destination StripeStream data addresses + * @param[out] sizes List of stream sizes in bytes */ // blockDim {compact_streams_block_size,1,1} CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) - gpuCompactOrcDataStreams(device_2dspan strm_desc, - device_2dspan streams) + gpuInitBatchedMemcpy(device_2dspan strm_desc, + device_2dspan streams, + device_span srcs, + device_span dsts, + device_span sizes) { - __shared__ __align__(16) StripeStream ss; - - auto const stripe_id = blockIdx.x; + auto const stripe_id = cudf::detail::grid_1d::global_thread_id(); auto const stream_id = blockIdx.y; - auto const t = threadIdx.x; + if (stripe_id >= strm_desc.size().first) { return; } - if (t == 0) { ss = strm_desc[stripe_id][stream_id]; } - __syncthreads(); + auto const out_id = stream_id * strm_desc.size().first + stripe_id; + StripeStream ss = strm_desc[stripe_id][stream_id]; if (ss.data_ptr == nullptr) { return; } auto const cid = ss.stream_type; auto dst_ptr = ss.data_ptr; for (auto group = ss.first_chunk_id; group < ss.first_chunk_id + ss.num_chunks; ++group) { + auto const out_id = stream_id * streams.size().second + group; + srcs[out_id] = streams[ss.column_id][group].data_ptrs[cid]; + dsts[out_id] = dst_ptr; + + // Also update the stream here, data will be copied in a separate kernel + streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; + auto const len = streams[ss.column_id][group].lengths[cid]; - if (len > 0) { - auto const src_ptr = 
streams[ss.column_id][group].data_ptrs[cid]; - for (uint32_t i = t; i < len; i += blockDim.x) { - dst_ptr[i] = src_ptr[i]; - } - __syncthreads(); - } - if (t == 0) { streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; } + // len is the size (in bytes) of the current stream. + sizes[out_id] = len; dst_ptr += len; } } @@ -1325,9 +1332,26 @@ void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) { + auto const num_rowgroups = enc_streams.size().second; + auto const num_streams = strm_desc.size().second; + auto const num_stripes = strm_desc.size().first; + auto const num_chunks = num_rowgroups * num_streams; + auto srcs = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto dsts = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto lengths = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); + dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( + strm_desc, enc_streams, srcs, dsts, lengths); + + // Copy streams in a batched manner. 
+ cudf::detail::batched_memcpy_async( + srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } std::optional CompressOrcDataStreams( diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b978799b8bc..312a5243687 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -228,7 +228,8 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list { +class parquet_field_string_list : public parquet_field_list { + public: parquet_field_string_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { @@ -396,8 +397,9 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list +class parquet_field_binary_list : public parquet_field_list, FieldType::BINARY> { + public: parquet_field_binary_list(int f, std::vector>& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e0d50d7ccf9..b3276c81c1f 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,6 +17,8 @@ #include "page_data.cuh" #include "page_decode.cuh" +#include + #include #include @@ -466,4 +468,28 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } } +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream) +{ + // Copy offsets to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async( + offsets, stream, cudf::get_current_device_resource_ref()); + // Iterator for the source (scalar) 
data + auto src_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); + + // Copy buffer addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async( + buff_addrs, stream, cudf::get_current_device_resource_ref()); + // size_iter is simply a constant iterator of sizeof(size_type) bytes. + auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); + + // Copy offsets to buffers in batched manner. + cudf::detail::batched_memcpy_async( + src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a77a5f5ad50..4c4c3b3fbed 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -297,7 +297,8 @@ struct PageInfo { int32_t uncompressed_page_size; // uncompressed data size in bytes // for V2 pages, the def and rep level data is not compressed, and lacks the 4-byte length // indicator. instead the lengths for these are stored in the header. - int32_t lvl_bytes[level_type::NUM_LEVEL_TYPES]; // length of the rep/def levels (V2 header) + int32_t // NOLINT + lvl_bytes[level_type::NUM_LEVEL_TYPES]; // length of the rep/def levels (V2 header) // Number of values in this data page or dictionary. // Important : the # of input values does not necessarily // correspond to the number of rows in the output. 
It just reflects the number @@ -348,7 +349,7 @@ struct PageInfo { PageNestingDecodeInfo* nesting_decode; // level decode buffers - uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; + uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; // NOLINT // temporary space for decoding DELTA_BYTE_ARRAY encoded strings int64_t temp_string_size; @@ -434,14 +435,14 @@ struct ColumnChunkDesc { size_t num_values{}; // total number of values in this column size_t start_row{}; // file-wide, absolute starting row of this chunk uint32_t num_rows{}; // number of rows in this chunk - int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level - int16_t max_nesting_depth{}; // max nesting depth of the output - int32_t type_length{}; // type length from schema (for FLBA only) - Type physical_type{}; // parquet physical data type - uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels - int32_t num_data_pages{}; // number of data pages - int32_t num_dict_pages{}; // number of dictionary pages + int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level // NOLINT + int16_t max_nesting_depth{}; // max nesting depth of the output + int32_t type_length{}; // type length from schema (for FLBA only) + Type physical_type{}; // parquet physical data type + uint8_t level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max // NOLINT + // definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; string_index_pair* str_dict_index{}; // index for string dictionary bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column @@ -799,6 +800,18 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Writes the final offsets to the corresponding list and string buffer end 
addresses in a + * batched manner. + * + * @param offsets Host span of final offsets + * @param buff_addrs Host span of corresponding output col buffer end addresses + * @param stream CUDA stream to use + */ +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the string column data stored in the pages * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 9f66160f73c..1153c9b9520 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -416,13 +416,15 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. Maybe use thrust::for_each - // on each buffer. + // For list and string columns, add the final offset to every offset buffer. // Note : the reason we are doing this here instead of in the decode kernel is // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span // page boundaries). + std::vector out_buffers; + std::vector final_offsets; + out_buffers.reserve(_input_columns.size()); + final_offsets.reserve(_input_columns.size()); for (size_t idx = 0; idx < _input_columns.size(); idx++) { input_column_info const& input_col = _input_columns[idx]; @@ -438,25 +440,21 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // the final offset for a list at level N is the size of it's child size_type const offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + (out_buf.size - 1)); + final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column auto const sz = static_cast(col_string_sizes[idx]); if (sz <= strings::detail::get_offset64_threshold()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); + final_offsets.emplace_back(sz); } } } } + // Write the final offsets for list and string columns in a batched manner + WriteFinalOffsets(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 2d46da14bec..62ffc4d3077 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -188,10 +188,10 @@ class reader::impl { * * Does not decompress the chunk data. * - * @return pair of boolean indicating if compressed chunks were found and a vector of futures for + * @return pair of boolean indicating if compressed chunks were found and a future for * read completion */ - std::pair>> read_column_chunks(); + std::pair> read_column_chunks(); /** * @brief Read compressed data and page information for the current pass. 
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8e67f233213..8cab68ea721 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -19,9 +19,9 @@ #include #include +#include #include #include -#include #include #include @@ -964,7 +964,7 @@ void reader::impl::allocate_level_decode_space() } } -std::pair>> reader::impl::read_column_chunks() +std::pair> reader::impl::read_column_chunks() { auto const& row_groups_info = _pass_itm_data->row_groups; @@ -989,7 +989,6 @@ std::pair>> reader::impl::read_column_chunks // TODO: make this respect the pass-wide skip_rows/num_rows instead of the file-wide // skip_rows/num_rows // auto remaining_rows = num_rows; - std::vector> read_chunk_tasks; size_type chunk_count = 0; for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); @@ -1018,16 +1017,15 @@ std::pair>> reader::impl::read_column_chunks } // Read compressed chunk data to device memory - read_chunk_tasks.push_back(read_column_chunks_async(_sources, - raw_page_data, - chunks, - 0, - chunks.size(), - column_chunk_offsets, - chunk_source_map, - _stream)); - - return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; + return {total_decompressed_size > 0, + read_column_chunks_async(_sources, + raw_page_data, + chunks, + 0, + chunks.size(), + column_chunk_offsets, + chunk_source_map, + _stream)}; } void reader::impl::read_compressed_data() @@ -1042,9 +1040,7 @@ void reader::impl::read_compressed_data() auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks(); pass.has_compressed_data = has_compressed_data; - for (auto& task : read_chunks_tasks) { - task.wait(); - } + read_chunks_tasks.wait(); // Process dataset chunk pages into output columns auto const total_pages = _has_page_index ? 
count_page_headers_with_pgidx(chunks, _stream) @@ -1660,9 +1656,9 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num } } - cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + cudf::detail::batched_memset(memset_bufs, static_cast(0), _stream); // Need to set null mask bufs to all high bits - cudf::io::detail::batched_memset( + cudf::detail::batched_memset( nullmask_bufs, std::numeric_limits::max(), _stream); } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 1dbb9369115..0b76f3d3e8f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -50,7 +50,8 @@ class file_sink : public data_sink { } } - ~file_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~file_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -114,7 +115,8 @@ class host_buffer_sink : public data_sink { public: explicit host_buffer_sink(std::vector* buffer) : buffer_(buffer) {} - ~host_buffer_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~host_buffer_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e4313eba454..0be976b6144 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -32,6 +32,7 @@ #include #include +#include namespace cudf { namespace io { @@ -54,6 +55,30 @@ class file_source : public datasource { } } + std::unique_ptr host_read(size_t offset, size_t size) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + ssize_t const read_size = std::min(size, _file.size() - offset); + + std::vector v(read_size); + CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read 
failed"); + return buffer::create(std::move(v)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + auto const read_size = std::min(size, _file.size() - offset); + + CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), + "read failed"); + return read_size; + } + ~file_source() override = default; [[nodiscard]] bool supports_device_read() const override @@ -138,40 +163,63 @@ class file_source : public datasource { */ class memory_mapped_source : public file_source { public: - explicit memory_mapped_source(char const* filepath, size_t offset, size_t size) + explicit memory_mapped_source(char const* filepath, + size_t offset, + size_t max_size_estimate, + size_t min_size_estimate) : file_source(filepath) { if (_file.size() != 0) { - map(_file.desc(), offset, size); - register_mmap_buffer(); + // Memory mapping is not exclusive, so we can include the whole region we expect to read + map(_file.desc(), offset, max_size_estimate); + // Buffer registration is exclusive (can't overlap with other registered buffers) so we + // register the lower estimate; this avoids issues when reading adjacent ranges from the same + // file from multiple threads + register_mmap_buffer(offset, min_size_estimate); } } ~memory_mapped_source() override { if (_map_addr != nullptr) { - munmap(_map_addr, _map_size); + unmap(); unregister_mmap_buffer(); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); + + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size); + } - // Clamp length to available data in the mapped region - 
auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is only partially within the registered region, copy to a new + // host buffer to make the data safe to copy to the device + if (_reg_addr != nullptr and + (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) { + auto const src = static_cast(_map_addr) + (offset - _map_offset); + + return std::make_unique>>( + std::vector(src, src + read_size)); + } return std::make_unique( - static_cast(_map_addr) + (offset - _map_offset), read_size); + static_cast(_map_addr) + offset - _map_offset, read_size); } size_t host_read(size_t offset, size_t size, uint8_t* dst) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size, dst); + } auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); @@ -184,16 +232,18 @@ class memory_mapped_source : public file_source { * * Fixes nvbugs/4215160 */ - void register_mmap_buffer() + void register_mmap_buffer(size_t offset, size_t size) { - if (_map_addr == nullptr or _map_size == 0 or not pageableMemoryAccessUsesHostPageTables()) { - return; - } + if (_map_addr == nullptr or not pageableMemoryAccessUsesHostPageTables()) { return; } - auto const result = cudaHostRegister(_map_addr, _map_size, cudaHostRegisterDefault); - if (result == cudaSuccess) { - _is_map_registered = true; - } else { + // Registered region must be within the mapped region + _reg_offset = std::max(offset, _map_offset); + _reg_size = 
std::min(size != 0 ? size : _map_size, (_map_offset + _map_size) - _reg_offset); + + _reg_addr = static_cast(_map_addr) - _map_offset + _reg_offset; + auto const result = cudaHostRegister(_reg_addr, _reg_size, cudaHostRegisterReadOnly); + if (result != cudaSuccess) { + _reg_addr = nullptr; CUDF_LOG_WARN("cudaHostRegister failed with {} ({})", static_cast(result), cudaGetErrorString(result)); @@ -205,10 +255,12 @@ class memory_mapped_source : public file_source { */ void unregister_mmap_buffer() { - if (not _is_map_registered) { return; } + if (_reg_addr == nullptr) { return; } - auto const result = cudaHostUnregister(_map_addr); - if (result != cudaSuccess) { + auto const result = cudaHostUnregister(_reg_addr); + if (result == cudaSuccess) { + _reg_addr = nullptr; + } else { CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})", static_cast(result), cudaGetErrorString(result)); @@ -226,52 +278,30 @@ class memory_mapped_source : public file_source { // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); + if (_map_size == 0) { return; } // Check if accessing a region within already mapped area _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } - private: - size_t _map_size = 0; - size_t _map_offset = 0; - void* _map_addr = nullptr; - bool _is_map_registered = false; -}; - -/** - * @brief Implementation class for reading from a file using `read` calls - * - * Potentially faster than `memory_mapped_source` when only a small portion of the file is read - * through the host. 
- */ -class direct_read_source : public file_source { - public: - explicit direct_read_source(char const* filepath) : file_source(filepath) {} - - std::unique_ptr host_read(size_t offset, size_t size) override + void unmap() { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - ssize_t const read_size = std::min(size, _file.size() - offset); - - std::vector v(read_size); - CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); - return buffer::create(std::move(v)); + if (_map_addr != nullptr) { + auto const result = munmap(_map_addr, _map_size); + if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); } + _map_addr = nullptr; + } } - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - auto const read_size = std::min(size, _file.size() - offset); + private: + size_t _map_offset = 0; + size_t _map_size = 0; + void* _map_addr = nullptr; - CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), - "read failed"); - return read_size; - } + size_t _reg_offset = 0; + size_t _reg_size = 0; + void* _reg_addr = nullptr; }; /** @@ -431,16 +461,21 @@ class user_datasource_wrapper : public datasource { std::unique_ptr datasource::create(std::string const& filepath, size_t offset, - size_t size) + size_t max_size_estimate, + size_t min_size_estimate) { + CUDF_EXPECTS(max_size_estimate == 0 or min_size_estimate <= max_size_estimate, + "Invalid min/max size estimates for datasource creation"); + #ifdef CUFILE_FOUND if (cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads - return std::make_unique(filepath.c_str()); + return std::make_unique(filepath.c_str()); } #endif // Use our own memory mapping implementation for direct file reads - return std::make_unique(filepath.c_str(), offset, size); + return std::make_unique( + filepath.c_str(), offset, 
max_size_estimate, min_size_estimate); } std::unique_ptr datasource::create(host_buffer const& buffer) diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp index d9eac423901..1d8b34addbd 100644 --- a/cpp/src/io/utilities/hostdevice_span.hpp +++ b/cpp/src/io/utilities/hostdevice_span.hpp @@ -43,8 +43,8 @@ class hostdevice_span { template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> + std::remove_pointer_t().host_ptr())> (*)[], // NOLINT + T (*)[]>>* = nullptr> // NOLINT constexpr hostdevice_span(C& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) { } @@ -54,8 +54,8 @@ class hostdevice_span { template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> + std::remove_pointer_t().host_ptr())> (*)[], // NOLINT + T (*)[]>>* = nullptr> // NOLINT constexpr hostdevice_span(C const& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) { diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index aed745c42dd..634e6d78ebc 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,23 +125,17 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(d_data, h_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy(d_data, h_data, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(h_data, d_data, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } 
+ void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy(h_data, d_data, stream); } /** * @brief Converts a hostdevice_vector into a hostdevice_span. diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh index f7e6de03354..8183a66f4f0 100644 --- a/cpp/src/io/utilities/output_builder.cuh +++ b/cpp/src/io/utilities/output_builder.cuh @@ -307,8 +307,8 @@ class output_builder { * @param mr The memory resource used to allocate the output vector. * @return The output vector. */ - rmm::device_uvector gather(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + [[nodiscard]] rmm::device_uvector gather(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { rmm::device_uvector output{size(), stream, mr}; auto output_it = output.begin(); diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp index 398c36821cc..519ac2d1a2e 100644 --- a/cpp/src/jit/parser.cpp +++ b/cpp/src/jit/parser.cpp @@ -19,8 +19,6 @@ #include #include -#include -#include #include #include #include @@ -28,7 +26,7 @@ namespace cudf { namespace jit { -constexpr char percent_escape[] = "_"; +constexpr char percent_escape[] = "_"; // NOLINT inline bool is_white(char const c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; } diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 0d017cf1f13..43c3b0a291b 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -292,32 +292,33 @@ std::unique_ptr make_tdigest_column(size_type num_rows, return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr); } -std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr make_empty_tdigests_column(size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto offsets = cudf::make_fixed_width_column( - data_type(type_id::INT32), 
2, mask_state::UNALLOCATED, stream, mr); + data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), offsets->mutable_view().begin(), offsets->mutable_view().end(), 0); - auto min_col = - cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + auto min_col = cudf::make_numeric_column( + data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), min_col->mutable_view().begin(), min_col->mutable_view().end(), 0); - auto max_col = - cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + auto max_col = cudf::make_numeric_column( + data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), max_col->mutable_view().begin(), max_col->mutable_view().end(), 0); - return make_tdigest_column(1, - make_empty_column(type_id::FLOAT64), - make_empty_column(type_id::FLOAT64), + return make_tdigest_column(num_rows, + cudf::make_empty_column(type_id::FLOAT64), + cudf::make_empty_column(type_id::FLOAT64), std::move(offsets), std::move(min_col), std::move(max_col), @@ -338,7 +339,7 @@ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto contents = make_empty_tdigest_column(stream, mr)->release(); + auto contents = make_empty_tdigests_column(1, stream, mr)->release(); return std::make_unique( std::move(*std::make_unique(std::move(contents.children))), true, stream, mr); } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index e1c1d2e3002..b0a84a6d50c 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -169,19 +169,19 @@ struct nearest_value_scalar_weights { */ template 
struct nearest_value_centroid_weights { - double const* cumulative_weights; - GroupOffsetsIter outer_offsets; // groups - size_type const* inner_offsets; // tdigests within a group + double const* cumulative_weights; // cumulative weights of non-empty clusters + GroupOffsetsIter group_offsets; // groups + size_type const* tdigest_offsets; // tdigests within a group thrust::pair operator() __device__(double next_limit, size_type group_index) const { - auto const tdigest_begin = outer_offsets[group_index]; - auto const tdigest_end = outer_offsets[group_index + 1]; - auto const num_weights = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + auto const tdigest_begin = group_offsets[group_index]; + auto const tdigest_end = group_offsets[group_index + 1]; + auto const num_weights = tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin]; // NOTE: as it is today, this functor will never be called for any digests that are empty, but // I'll leave this check here for safety. if (num_weights == 0) { return thrust::pair{0, 0}; } - double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin]; + double const* group_cumulative_weights = cumulative_weights + tdigest_offsets[tdigest_begin]; auto const index = ((thrust::lower_bound(thrust::seq, group_cumulative_weights, @@ -235,21 +235,26 @@ struct cumulative_scalar_weight { */ template struct cumulative_centroid_weight { - double const* cumulative_weights; - GroupLabelsIter group_labels; - GroupOffsetsIter outer_offsets; // groups - cudf::device_span inner_offsets; // tdigests with a group - + double const* cumulative_weights; // cumulative weights of non-empty clusters + GroupLabelsIter group_labels; // group labels for each tdigest including empty ones + GroupOffsetsIter group_offsets; // groups + cudf::device_span tdigest_offsets; // tdigests with a group + + /** + * @brief Returns the cumulative weight for a given value index. 
The index `n` is the index of + * `n`-th non-empty cluster. + */ std::tuple operator() __device__(size_type value_index) const { auto const tdigest_index = static_cast( - thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) - - inner_offsets.begin()) - + thrust::upper_bound( + thrust::seq, tdigest_offsets.begin(), tdigest_offsets.end(), value_index) - + tdigest_offsets.begin()) - 1; auto const group_index = group_labels[tdigest_index]; - auto const first_tdigest_index = outer_offsets[group_index]; - auto const first_weight_index = inner_offsets[first_tdigest_index]; + auto const first_tdigest_index = group_offsets[group_index]; + auto const first_weight_index = tdigest_offsets[first_tdigest_index]; auto const relative_value_index = value_index - first_weight_index; double const* group_cumulative_weights = cumulative_weights + first_weight_index; @@ -284,15 +289,15 @@ struct scalar_group_info { // retrieve group info of centroid inputs by group index template struct centroid_group_info { - double const* cumulative_weights; - GroupOffsetsIter outer_offsets; - size_type const* inner_offsets; + double const* cumulative_weights; // cumulative weights of non-empty clusters + GroupOffsetsIter group_offsets; + size_type const* tdigest_offsets; __device__ thrust::tuple operator()(size_type group_index) const { // if there's no weights in this group of digests at all, return 0. - auto const group_start = inner_offsets[outer_offsets[group_index]]; - auto const group_end = inner_offsets[outer_offsets[group_index + 1]]; + auto const group_start = tdigest_offsets[group_offsets[group_index]]; + auto const group_end = tdigest_offsets[group_offsets[group_index + 1]]; auto const num_weights = group_end - group_start; auto const last_weight_index = group_end - 1; return num_weights == 0 @@ -367,7 +372,6 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, * @param group_num_clusters Output. 
The number of output clusters for each input group. * @param group_cluster_offsets Offsets per-group to the start of it's clusters * @param has_nulls Whether or not the input contains nulls - * */ template @@ -661,6 +665,10 @@ std::unique_ptr build_output_column(size_type num_rows, mr); } +/** + * @brief A functor which returns the cluster index within a group that the value at + * the given value index falls into. + */ template struct compute_tdigests_keys_fn { int const delta; @@ -706,8 +714,8 @@ struct compute_tdigests_keys_fn { * boundaries. * * @param delta tdigest compression level - * @param values_begin Beginning of the range of input values. - * @param values_end End of the range of input values. + * @param centroids_begin Beginning of the range of centroids. + * @param centroids_end End of the range of centroids. * @param cumulative_weight Functor which returns cumulative weight and group information for * an absolute input value index. * @param min_col Column containing the minimum value per group. @@ -750,7 +758,9 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } + if (total_clusters == 0) { + return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); + } // each input group represents an individual tdigest. within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -983,38 +993,54 @@ struct typed_reduce_tdigest { } }; -// utility for merge_tdigests. +/** + * @brief Functor to compute the number of clusters in each group. + * + * Used in `merge_tdigests`. 
+ */ template -struct group_num_weights_func { - GroupOffsetsIter outer_offsets; - size_type const* inner_offsets; +struct group_num_clusters_func { + GroupOffsetsIter group_offsets; + size_type const* tdigest_offsets; __device__ size_type operator()(size_type group_index) { - auto const tdigest_begin = outer_offsets[group_index]; - auto const tdigest_end = outer_offsets[group_index + 1]; - return inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + auto const tdigest_begin = group_offsets[group_index]; + auto const tdigest_end = group_offsets[group_index + 1]; + return tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin]; } }; -// utility for merge_tdigests. +/** + * @brief Function to determine if a group is empty. + * + * Used in `merge_tdigests`. + */ struct group_is_empty { __device__ bool operator()(size_type group_size) { return group_size == 0; } }; -// utility for merge_tdigests. +/** + * @brief Functor that returns the grouping key for each tdigest cluster. + * + * Used in `merge_tdigests`. + */ template struct group_key_func { GroupLabelsIter group_labels; - size_type const* inner_offsets; - size_type num_inner_offsets; + size_type const* tdigest_offsets; + size_type num_tdigest_offsets; + /** + * @brief Returns the group index for an absolute cluster index. The index `n` is the index of the + * `n`-th non-empty cluster. 
+ */ __device__ size_type operator()(size_type index) { // what -original- tdigest index this absolute index corresponds to - auto const iter = thrust::prev( - thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index)); - auto const tdigest_index = thrust::distance(inner_offsets, iter); + auto const iter = thrust::prev(thrust::upper_bound( + thrust::seq, tdigest_offsets, tdigest_offsets + num_tdigest_offsets, index)); + auto const tdigest_index = thrust::distance(tdigest_offsets, iter); // what group index the original tdigest belongs to return group_labels[tdigest_index]; @@ -1040,8 +1066,8 @@ std::pair, rmm::device_uvector> generate_mer // each group represents a collection of tdigest columns. each row is 1 tdigest. // within each group, we want to sort all the centroids within all the tdigests - // in that group, using the means as the key. the "outer offsets" represent the indices of the - // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest. + // in that group, using the means as the key. the "group offsets" represent the indices of the + // tdigests, and the "tdigest offsets" represents the list of centroids for a particular tdigest. 
// // rows // ---- centroid 0 --------- @@ -1054,12 +1080,12 @@ std::pair, rmm::device_uvector> generate_mer // tdigest 3 centroid 7 // centroid 8 // ---- centroid 9 -------- - auto inner_offsets = tdv.centroids().offsets(); + auto tdigest_offsets = tdv.centroids().offsets(); auto centroid_offsets = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type( - [group_offsets, inner_offsets = tdv.centroids().offsets().begin()] __device__( - size_type i) { return inner_offsets[group_offsets[i]]; })); + [group_offsets, tdigest_offsets = tdv.centroids().offsets().begin()] __device__( + size_type i) { return tdigest_offsets[group_offsets[i]]; })); // perform the sort using the means as the key size_t temp_size; @@ -1091,9 +1117,34 @@ std::pair, rmm::device_uvector> generate_mer return {std::move(output_means), std::move(output_weights)}; } +/** + * @brief Perform a merge aggregation of tdigests. This function usually takes the input as the + * outputs of multiple `typed_group_tdigest` calls, and merges them. + * + * A tdigest can be empty in the input, which means that there was no valid input data to generate + * it. These empty tdigests will have no centroids (means or weights) and will have a `min` and + * `max` of 0. + * + * @param tdv input tdigests. The tdigests within this column are grouped by key. + * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is + * counted as one even when the cluster is empty in it. The offsets should have the same values as + * the ones in `group_offsets`. + * @param group_offsets a device iterator of the offsets to the start of each group. A group is + * counted as one even when the cluster is empty in it. The offsets should have the same values as + * the ones in `h_group_offsets`. + * @param group_labels a device iterator of the the group label for each tdigest cluster including + * empty clusters. + * @param num_group_labels the number of unique group labels. 
+ * @param num_groups the number of groups. + * @param max_centroids the maximum number of centroids (clusters) in the output (merged) tdigest. + * @param stream CUDA stream + * @param mr device memory resource + * + * @return A column containing the merged tdigests. + */ template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, - HGroupOffsetIter h_outer_offsets, + HGroupOffsetIter h_group_offsets, GroupOffsetIter group_offsets, GroupLabelIter group_labels, size_t num_group_labels, @@ -1133,22 +1184,24 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, thrust::equal_to{}, // key equality check thrust::maximum{}); + auto tdigest_offsets = tdv.centroids().offsets(); + // for any empty groups, set the min and max to be 0. not technically necessary but it makes // testing simpler. - auto group_num_weights = cudf::detail::make_counting_transform_iterator( + auto group_num_clusters = cudf::detail::make_counting_transform_iterator( 0, - group_num_weights_func{group_offsets, - tdv.centroids().offsets().begin()}); + group_num_clusters_func{group_offsets, + tdigest_offsets.begin()}); thrust::replace_if(rmm::exec_policy(stream), merged_min_col->mutable_view().begin(), merged_min_col->mutable_view().end(), - group_num_weights, + group_num_clusters, group_is_empty{}, 0); thrust::replace_if(rmm::exec_policy(stream), merged_max_col->mutable_view().begin(), merged_max_col->mutable_view().end(), - group_num_weights, + group_num_clusters, group_is_empty{}, 0); @@ -1166,14 +1219,13 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, // generate group keys for all centroids in the entire column rmm::device_uvector group_keys(num_centroids, stream, temp_mr); - auto iter = thrust::make_counting_iterator(0); - auto inner_offsets = tdv.centroids().offsets(); + auto iter = thrust::make_counting_iterator(0); thrust::transform(rmm::exec_policy(stream), iter, iter + num_centroids, group_keys.begin(), group_key_func{ - group_labels, 
inner_offsets.begin(), inner_offsets.size()}); + group_labels, tdigest_offsets.begin(), tdigest_offsets.size()}); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), group_keys.begin(), group_keys.begin() + num_centroids, @@ -1182,20 +1234,24 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto const delta = max_centroids; + // TDigest merge takes the output of typed_group_tdigest as its input, which must not have + // any nulls. + auto const has_nulls = false; + // generate cluster info auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( delta, num_groups, nearest_value_centroid_weights{ - cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + cumulative_weights.begin(), group_offsets, tdigest_offsets.begin()}, centroid_group_info{ - cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + cumulative_weights.begin(), group_offsets, tdigest_offsets.begin()}, cumulative_centroid_weight{ cumulative_weights.begin(), group_labels, group_offsets, - {inner_offsets.begin(), static_cast(inner_offsets.size())}}, - false, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + has_nulls, stream, mr); @@ -1212,13 +1268,13 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, cumulative_weights.begin(), group_labels, group_offsets, - {inner_offsets.begin(), static_cast(inner_offsets.size())}}, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, std::move(merged_min_col), std::move(merged_max_col), group_cluster_wl, std::move(group_cluster_offsets), total_clusters, - false, + has_nulls, stream, mr); } @@ -1283,7 +1339,7 @@ std::unique_ptr group_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); } auto const delta 
= max_centroids; return cudf::type_dispatcher(col.type(), @@ -1309,7 +1365,15 @@ std::unique_ptr group_merge_tdigest(column_view const& input, tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { - return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); + return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); + } + + if (tdv.means().size() == 0) { + // `group_merge_tdigest` takes the output of `typed_group_tdigest` as its input, which wipes + // out the means and weights for empty clusters. Thus, no mean here indicates that all clusters + // are empty in the input. Let's skip all complex computation in the below, but just return + // an empty tdigest per group. + return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr); } // bring group offsets back to the host diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 6bc8b48832f..cd9fade164a 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -18,13 +18,18 @@ #include #include +#include #include +#include #include #include #include #include +#include +#include + namespace cudf { namespace reduction { namespace compound { @@ -53,9 +58,17 @@ std::unique_ptr compound_reduction(column_view const& col, { auto const valid_count = col.size() - col.null_count(); + // All null input produces all null output + if (valid_count == 0 || + // Only care about ddof for standard deviation and variance right now + valid_count <= ddof && (std::is_same_v || + std::is_same_v)) { + auto result = cudf::make_fixed_width_scalar(output_dtype, stream, mr); + result->set_valid_async(false, stream); + return result; + } // reduction by iterator auto dcol = cudf::column_device_view::create(col, stream); - std::unique_ptr result; Op compound_op{}; if (!cudf::is_dictionary(col.type())) { @@ -63,25 +76,21 @@ std::unique_ptr compound_reduction(column_view const& col, auto it = thrust::make_transform_iterator( 
dcol->pair_begin(), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), compound_op.template get_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } } else { auto it = thrust::make_transform_iterator( cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol, col.has_nulls()), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } - - // set scalar is valid - result->set_valid_async(col.null_count() < col.size(), stream); - return result; }; // @brief result type dispatcher for compound reduction (a.k.a. 
mean, var, std) @@ -137,6 +146,7 @@ struct element_type_dispatcher { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(ddof >= 0, "ddof must be non-negative", std::domain_error); return cudf::type_dispatcher( output_dtype, result_type_dispatcher(), col, output_dtype, ddof, stream, mr); } diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 7c4c89bd3fb..51c6e765edd 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -35,7 +35,7 @@ namespace strings { namespace detail { namespace { // Bitmask of all operators -#define OPERATOR_MASK 0200 +enum { OPERATOR_MASK = 0200 }; enum OperatorType : int32_t { START = 0200, // Start, used for marker on stack LBRA_NC = 0203, // non-capturing group @@ -50,7 +50,7 @@ enum OperatorType : int32_t { COUNTED_LAZY = 0215, NOP = 0302, // No operation, internal use only }; -#define ITEM_MASK 0300 +enum { ITEM_MASK = 0300 }; static reclass cclass_w(CCLASS_W); // \w static reclass cclass_s(CCLASS_S); // \s diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index d8c1b50a94b..21708e48a25 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -126,6 +126,43 @@ std::unique_ptr findall(strings_column_view const& input, mr); } +namespace { +struct find_re_fn { + column_device_view d_strings; + + __device__ size_type operator()(size_type const idx, + reprog_device const prog, + int32_t const thread_idx) const + { + if (d_strings.is_null(idx)) { return 0; } + auto const d_str = d_strings.element(idx); + + auto const result = prog.find(thread_idx, d_str, d_str.begin()); + return result.has_value() ? 
result.value().first : -1; + } +}; +} // namespace + +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto results = make_numeric_column(data_type{type_to_id()}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + if (input.is_empty()) { return results; } + + auto d_results = results->mutable_view().data(); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); + auto const d_strings = column_device_view::create(input.parent(), stream); + launch_transform_kernel(find_re_fn{*d_strings}, *d_prog, d_results, input.size(), stream); + + return results; +} } // namespace detail // external API @@ -139,5 +176,14 @@ std::unique_ptr findall(strings_column_view const& input, return detail::findall(input, prog, stream, mr); } +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::find_re(input, prog, stream, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index a87ecb81b9d..997b0278fe2 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,9 @@ namespace nvtext { namespace detail { namespace { +// long strings threshold found with benchmarking +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 64; + /** * @brief Generate ngrams from strings column. * @@ -173,33 +177,39 @@ constexpr cudf::thread_index_type bytes_per_thread = 4; /** * @brief Counts the number of ngrams in each row of the given strings column * - * Each warp processes a single string. + * Each warp/thread processes a single string. 
* Formula is `count = max(0,str.length() - ngrams + 1)` * If a string has less than ngrams characters, its count is 0. */ CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + cudf::size_type tile_size, cudf::size_type* d_counts) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / cudf::detail::warp_size; + auto const str_idx = idx / tile_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { d_counts[str_idx] = 0; return; } + auto const d_str = d_strings.element(str_idx); + if (tile_size == 1) { + d_counts[str_idx] = cuda::std::max(0, (d_str.length() + 1 - ngrams)); + return; + } + namespace cg = cooperative_groups; auto const warp = cg::tiled_partition(cg::this_thread_block()); - auto const d_str = d_strings.element(str_idx); - auto const end = d_str.data() + d_str.size_bytes(); + auto const end = d_str.data() + d_str.size_bytes(); auto const lane_idx = warp.thread_rank(); cudf::size_type count = 0; for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; - itr += cudf::detail::warp_size * bytes_per_thread) { + itr += tile_size * bytes_per_thread) { for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); } @@ -256,19 +266,27 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Parameter ngrams should be an integer value of 2 or greater", std::invalid_argument); - auto const strings_count = input.size(); - if (strings_count == 0) { // if no strings, return an empty column - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (input.is_empty()) { // if no strings, return an empty column + return cudf::lists::detail::make_empty_lists_column( + cudf::data_type{cudf::type_id::STRING}, stream, mr); + } + if (input.size() == input.null_count()) { + return cudf::lists::detail::make_all_nulls_lists_column( + 
input.size(), cudf::data_type{cudf::type_id::STRING}, stream, mr); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto [offsets, total_ngrams] = [&] { - auto counts = rmm::device_uvector(input.size(), stream); - auto const num_blocks = cudf::util::div_rounding_up_safe( - static_cast(input.size()) * cudf::detail::warp_size, block_size); - count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + auto counts = rmm::device_uvector(input.size(), stream); + auto const avg_char_bytes = (input.chars_size(stream) / (input.size() - input.null_count())); + auto const tile_size = (avg_char_bytes < AVG_CHAR_BYTES_THRESHOLD) + ? 1 // thread per row + : cudf::detail::warp_size; // warp per row + auto const grid = cudf::detail::grid_1d( + static_cast(input.size()) * tile_size, block_size); + count_char_ngrams_kernel<<>>( + *d_strings, ngrams, tile_size, counts.data()); return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); @@ -277,8 +295,8 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Insufficient number of characters in each string to generate ngrams"); character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets}; - auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - generator, strings_count, total_ngrams, stream, mr); + auto [offsets_column, chars] = + cudf::strings::detail::make_strings_children(generator, input.size(), total_ngrams, stream, mr); auto output = cudf::make_strings_column( total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); @@ -368,7 +386,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto [offsets, total_ngrams] = [&] { auto counts = rmm::device_uvector(input.size(), stream); count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + *d_strings, ngrams, cudf::detail::warp_size, counts.data()); return 
cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 0efb881eb3e..c0af27a1748 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -30,7 +30,7 @@ namespace cudf::detail { namespace { // Simple kernel to copy between device buffers -CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } @@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea }; // namespace -void cuda_memcpy_async( +void cuda_memcpy_async_impl( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { if (kind == host_memory_kind::PINNED) { @@ -73,11 +73,4 @@ void cuda_memcpy_async( } } -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} - } // namespace cudf::detail diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 125b98c4a67..9d8e3cf2fa6 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -115,12 +115,19 @@ class fixed_pinned_pool_memory_resource { return !operator==(other); } - friend void get_property(fixed_pinned_pool_memory_resource const&, + // clang-tidy will complain about this function because it is completely + // unused at runtime and only exist for tag introspection by CCCL, so we + // ignore linting. This masks a real issue if we ever want to compile with + // clang, though, which is that the function will actually be compiled out by + // clang. 
If cudf were ever to try to support clang as a compile we would + // need to force the compiler to emit this symbol. The same goes for the + // other get_property definitions in this file. + friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::device_accessible) noexcept { } - friend void get_property(fixed_pinned_pool_memory_resource const&, + friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::host_accessible) noexcept { } @@ -235,7 +242,9 @@ class new_delete_memory_resource { bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + // NOLINTBEGIN friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} + // NOLINTEND }; static_assert(cuda::mr::resource_with, diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp index d54f5677c4c..e52fffbd8c6 100644 --- a/cpp/src/utilities/logger.cpp +++ b/cpp/src/utilities/logger.cpp @@ -74,8 +74,10 @@ struct logger_wrapper { } // namespace -spdlog::logger& cudf::logger() +spdlog::logger& cudf::detail::logger() { static logger_wrapper wrapped{}; return wrapped.logger_; } + +spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9824c472b20..8c29182bfb5 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -82,7 +82,7 @@ class rmm_cuda_stream_pool : public cuda_stream_pool { return streams; } - std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; } + [[nodiscard]] std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; } }; /** diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..4596ec65ce7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -385,6 +385,8 @@ ConfigureTest( # * utilities tests 
------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST + utilities_tests/batched_memcpy_tests.cu + utilities_tests/batched_memset_tests.cu utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp @@ -395,7 +397,6 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp - utilities_tests/batched_memset_tests.cu ) # ################################################################################################## diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 6b350c137d0..a4bde50a21e 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -378,7 +378,7 @@ TEST_F(TransformTest, DeeplyNestedArithmeticLogicalExpression) auto expressions = std::list(); auto op = arithmetic_operator; - expressions.push_back(cudf::ast::operation(op, col_ref, col_ref)); + expressions.emplace_back(op, col_ref, col_ref); for (int64_t i = 0; i < depth_level - 1; i++) { if (i == depth_level - 2) { @@ -387,9 +387,9 @@ TEST_F(TransformTest, DeeplyNestedArithmeticLogicalExpression) op = arithmetic_operator; } if (nested_left_tree) { - expressions.push_back(cudf::ast::operation(op, expressions.back(), col_ref)); + expressions.emplace_back(op, expressions.back(), col_ref); } else { - expressions.push_back(cudf::ast::operation(op, col_ref, expressions.back())); + expressions.emplace_back(op, col_ref, expressions.back()); } } return expressions; diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 06e0d193d80..aa5b49567e6 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -557,7 +557,11 @@ auto NullOp_Result(cudf::column_view lhs, cudf::column_view rhs) std::transform(thrust::make_counting_iterator(0), 
thrust::make_counting_iterator(lhs.size()), result.begin(), - [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { + [&lhs_data = lhs_data, + &lhs_mask = lhs_mask, + &rhs_data = rhs_data, + &rhs_mask = rhs_mask, + &result_mask = result_mask](auto i) -> TypeOut { auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); bool output_valid = lhs_valid or rhs_valid; diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index c900c4c558c..ef1ccfccab5 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -48,7 +48,7 @@ struct Add { void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) + static_cast(rhs)); } }; @@ -72,7 +72,7 @@ struct Sub { void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) - static_cast(rhs)); } }; @@ -83,7 +83,7 @@ struct Mul { std::enable_if_t::value, void>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) const { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) * static_cast(rhs)); } @@ -100,7 +100,7 @@ struct Mul { std::enable_if_t<(cudf::is_duration_t::value && std::is_integral_v) || (cudf::is_duration_t::value && std::is_integral_v), void>* = nullptr> - OutT DurationProduct(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationProduct(LhsT x, RhsT y) const { return x * y; } @@ -112,7 +112,7 @@ struct Div { std::enable_if_t::value, void>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; 
return static_cast(static_cast(lhs) / static_cast(rhs)); } @@ -128,7 +128,7 @@ struct Div { typename LhsT, typename RhsT, std::enable_if_t<(std::is_integral_v || cudf::is_duration()), void>* = nullptr> - OutT DurationDivide(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationDivide(LhsT x, RhsT y) const { return x / y; } @@ -191,33 +191,31 @@ struct FloorDiv { template struct Mod { - template ::type>)>* = nullptr> + template >)>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) % static_cast(rhs)); } - template ::type, float>)>* = nullptr> + template < + typename OutT = TypeOut, + typename LhsT = TypeLhs, + typename RhsT = TypeRhs, + std::enable_if_t<(std::is_same_v, float>)>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { return static_cast(fmod(static_cast(lhs), static_cast(rhs))); } template < - typename OutT = TypeOut, - typename LhsT = TypeLhs, - typename RhsT = TypeRhs, - std::enable_if_t<(std::is_same_v::type, double>)>* = - nullptr> + typename OutT = TypeOut, + typename LhsT = TypeLhs, + typename RhsT = TypeRhs, + std::enable_if_t<(std::is_same_v, double>)>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { return static_cast(fmod(static_cast(lhs), static_cast(rhs))); @@ -326,7 +324,7 @@ struct LogBase { template struct PMod { - using CommonArgsT = typename std::common_type::type; + using CommonArgsT = std::common_type_t; TypeOut operator()(TypeLhs x, TypeRhs y) const { @@ -351,8 +349,8 @@ struct PyMod { TypeOut operator()(TypeLhs x, TypeRhs y) const { if constexpr (std::is_floating_point_v or std::is_floating_point_v) { - double x1 = static_cast(x); - double y1 = static_cast(y); + auto x1 = static_cast(x); + auto y1 = static_cast(y); return fmod(fmod(x1, y1) + y1, y1); } else { return ((x % y) + y) % y; diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 
14b4197de71..631f5150829 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -340,7 +340,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorNoMask) cudf::column moved_to{std::move(original)}; - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); verify_column_views(moved_to); @@ -359,7 +359,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorWithMask) cudf::column moved_to{std::move(original)}; verify_column_views(moved_to); - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); // Verify move diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 7c8729b6a77..4124f749012 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -73,44 +73,45 @@ TYPED_TEST(CopyTest, CopyIfElseTestLong) using T = TypeParam; // make sure we span at least 2 warps - int num_els = 64; - - bool mask[] = {true, false, true, false, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, false, false, false, false, true, true, true, - true, true, true, true, true, true, false, false, false, false, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); - - bool lhs_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper lhs_w({5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, - lhs_v); - - bool rhs_v[] = {true, true, true, true, true, true, false, false, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper rhs_w({6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, - rhs_v); - - bool exp_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper expected_w({5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, - 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, - exp_v); + constexpr int num_els = 64; + + std::array mask{ + true, false, true, false, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); + + wrapper 
lhs_w( + {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + wrapper rhs_w( + {6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, + {true, true, true, true, true, true, false, false, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + wrapper expected_w( + {5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, + 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -318,19 +319,17 @@ 
TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarColumn) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); cudf::numeric_scalar lhs_w(5); auto const rhs = cudf::test::make_type_param_vector({6, 6, 6, 6}); - bool rhs_v[] = {true, false, true, true}; - wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v); + std::array rhs_v{true, false, true, true}; + wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v.begin()); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); - wrapper expected_w(expected.begin(), expected.end(), rhs_v); + wrapper expected_w(expected.begin(), expected.end(), rhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -340,20 +339,18 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestColumnScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - bool mask_v[] = {true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els, mask_v); + std::array mask{true, false, false, true}; + std::array mask_v{true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto const lhs = cudf::test::make_type_param_vector({5, 5, 5, 5}); - bool lhs_v[] = {false, true, true, true}; - wrapper lhs_w(lhs.begin(), lhs.end(), lhs_v); + std::array lhs_v{false, true, true, true}; + wrapper lhs_w(lhs.begin(), lhs.end(), lhs_v.begin()); cudf::numeric_scalar rhs_w(6); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 6}); - wrapper expected_w(expected.begin(), expected.end(), lhs_v); + wrapper expected_w(expected.begin(), expected.end(), lhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -363,16 +360,14 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); cudf::numeric_scalar lhs_w(5); cudf::numeric_scalar rhs_w(6, false); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); - wrapper expected_w(expected.begin(), expected.end(), mask); + wrapper expected_w(expected.begin(), expected.end(), mask.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -405,17 +400,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarColumn) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); - bool rhs_v[] = {true, false, true, true}; - wrapper rhs_w({6, 6, 6, 6}, rhs_v); + std::array rhs_v{true, false, true, true}; + wrapper rhs_w({6, 6, 6, 6}, rhs_v.begin()); - wrapper expected_w({5, 6, 6, 5}, rhs_v); + wrapper expected_w({5, 6, 6, 5}, rhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -425,17 +418,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestColumnScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); - bool lhs_v[] = {false, true, 
true, true}; - wrapper lhs_w({5, 5, 5, 5}, lhs_v); + std::array lhs_v{false, true, true, true}; + wrapper lhs_w({5, 5, 5, 5}, lhs_v.begin()); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), true); - wrapper expected_w({5, 6, 6, 5}, lhs_v); + wrapper expected_w({5, 6, 6, 5}, lhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -445,15 +436,13 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), false); - wrapper expected_w({5, 6, 6, 5}, mask); + wrapper expected_w({5, 6, 6, 5}, mask.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -483,9 +472,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElse) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, true, false, true, false, true}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, true, false, true, false, true}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -510,9 +499,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarColumn) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper 
strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, false, true, false, true, false}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, false, true, false, true, false}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -538,8 +527,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseColumnScalar) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {false, true, true, true, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + std::array mask{false, true, true, true, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(strings2, strings1, mask_w); @@ -565,9 +554,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarScalar) std::vector h_string2{"aaa"}; cudf::string_scalar string2{h_string2[0], false}; - constexpr cudf::size_type mask_size = 6; - bool mask[] = {true, false, true, false, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + mask_size); + std::array mask{true, false, true, false, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(string1, string2, mask_w); @@ -652,9 +640,9 @@ TEST_F(DictionaryCopyIfElseTest, ColumnColumn) cudf::test::dictionary_column_wrapper input2( h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, true, false, true, false, true}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, true, false, true, false, true}; + std::array 
mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(input1, input2, mask_w); auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); @@ -679,8 +667,8 @@ TEST_F(DictionaryCopyIfElseTest, ColumnScalar) cudf::test::dictionary_column_wrapper input2( h_strings.begin(), h_strings.end(), valids); - bool mask[] = {false, true, true, true, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + std::array mask{false, true, true, true, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(input2, input1, mask_w); auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index bebd3d25610..aef0d4ad78a 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -370,11 +371,12 @@ TEST_F(SliceStringTableTest, StringWithNulls) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); diff --git 
a/cpp/tests/copying/slice_tests.cuh b/cpp/tests/copying/slice_tests.cuh index a180740f143..1e037294527 100644 --- a/cpp/tests/copying/slice_tests.cuh +++ b/cpp/tests/copying/slice_tests.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -148,7 +148,7 @@ std::vector create_expected_tables(cudf::size_type num_cols, } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; @@ -163,13 +163,12 @@ inline std::vector create_expected_string_co for (unsigned long index = 0; index < indices.size(); index += 2) { if (not nullable) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1])); + result.emplace_back(strings.begin() + indices[index], strings.begin() + indices[index + 1]); } else { auto valids = cudf::detail::make_counting_transform_iterator( indices[index], [](auto i) { return i % 2 == 0; }); - result.push_back(cudf::test::strings_column_wrapper( - strings.begin() + indices[index], strings.begin() + indices[index + 1], valids)); + result.emplace_back( + strings.begin() + indices[index], strings.begin() + indices[index + 1], valids); } } @@ -184,16 +183,16 @@ inline std::vector create_expected_string_co std::vector result = {}; for (unsigned long index = 0; index < indices.size(); index += 2) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1], - validity.begin() + indices[index])); + result.emplace_back(strings.begin() + indices[index], + strings.begin() + indices[index + 1], + validity.begin() + indices[index]); } return result; } inline std::vector create_expected_string_tables( - std::vector const strings[2], + std::vector> const strings, std::vector const& 
indices, bool nullable) { @@ -216,7 +215,7 @@ inline std::vector create_expected_string_tables( } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index ee3e7da5e0f..b56b0f2d3f8 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -135,7 +136,7 @@ std::vector create_expected_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], + std::vector> const strings, std::vector const& splits, bool nullable) { @@ -144,8 +145,8 @@ std::vector create_expected_string_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], - std::vector const validity[2], + std::vector> const strings, + std::vector> const validity, std::vector const& splits) { std::vector indices = splits_to_indices(splits, strings[0].size()); @@ -627,11 +628,12 @@ void split_string_with_invalids(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -658,11 +660,12 @@ void 
split_empty_output_strings_column_value(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -684,9 +687,9 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector splits{2, 5, 9}; @@ -699,16 +702,17 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) EXPECT_NO_THROW(Split(empty_table, splits)); } - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), no_valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), no_valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); scols.push_back(sw[1].release()); cudf::table src_table(std::move(scols)); 
auto result = Split(src_table, splits); - std::vector validity_masks[2] = {std::vector(strings[0].size()), - std::vector(strings[0].size())}; + std::vector> validity_masks{std::vector(strings[0].size()), + std::vector(strings[0].size())}; std::generate( validity_masks[1].begin(), validity_masks[1].end(), [i = 0]() mutable { return i++ % 2 == 0; }); @@ -1913,9 +1917,9 @@ TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) cudf::size_type start = 0; auto valids = cudf::detail::make_counting_transform_iterator(start, [](auto i) { return true; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector> cols; @@ -2377,7 +2381,7 @@ TEST_F(ContiguousSplitTableCornerCases, OutBufferToSmall) { // internally, contiguous split chunks GPU work in 1MB contiguous copies // so the output buffer must be 1MB or larger. 
- EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); + EXPECT_THROW(auto _ = cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); } TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 13577c4d0ea..603edb27c7c 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -196,6 +196,136 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ns), fixed_width_column_wrapper{766, 424, 623}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1969, 1970, 1970}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MONTH), + 
fixed_width_column_wrapper{12, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{31, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{23, 0, 
0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{762, 0, 929}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{976, 
23, 987}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{675, 432, 234}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{766, 424, 623}); } template diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index 5651a26f192..0783b4e5bbb 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -41,8 +41,7 @@ TYPED_TEST(SequenceTypedTestFixture, Incrementing) cudf::size_type num_els = 10; - T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto result = cudf::sequence(num_els, init, step); @@ -58,8 +57,8 @@ 
TYPED_TEST(SequenceTypedTestFixture, Decrementing) cudf::size_type num_els = 10; - T expected[] = {0, -5, -10, -15, -20, -25, -30, -35, -40, -45}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w( + {0, -5, -10, -15, -20, -25, -30, -35, -40, -45}); auto result = cudf::sequence(num_els, init, step); @@ -75,8 +74,7 @@ TYPED_TEST(SequenceTypedTestFixture, EmptyOutput) cudf::size_type num_els = 0; - T expected[] = {}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({}); auto result = cudf::sequence(num_els, init, step); @@ -121,8 +119,7 @@ TYPED_TEST(SequenceTypedTestFixture, DefaultStep) cudf::size_type num_els = 10; - T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto result = cudf::sequence(num_els, init); diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 749f4013013..a79b6a32916 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -127,8 +127,9 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) using LCW = cudf::test::lists_column_wrapper; cudf::test::fixed_width_column_wrapper keys{1, 1, 2, 2, 3, 3, 4, 4}; - bool const validity_mask[] = {true, false, false, true, true, true, false, false}; - LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}}, validity_mask}; + std::array const validity_mask{true, false, false, true, true, true, false, false}; + LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}}, + validity_mask.data()}; cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index 0cb5ee30a8b..e9c72293649 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -49,7 +49,7 @@ TYPED_TEST(groupby_mean_test, basic) { using V = TypeParam; using R = cudf::detail::target_type_t; - using RT = typename std::conditional(), int, double>::type; + using RT = std::conditional_t(), int, double>; cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; cudf::test::fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -114,7 +114,7 @@ TYPED_TEST(groupby_mean_test, null_keys_and_values) { using V = TypeParam; using R = cudf::detail::target_type_t; - using RT = typename std::conditional(), int, double>::type; + using RT = std::conditional_t(), int, double>; cudf::test::fixed_width_column_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index baa59026b07..4ae5d06b214 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups) cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; int const delta = 1000; - auto a = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), 
cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto b = cudf::type_dispatcher( static_cast(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta); - auto c = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto d = cudf::type_dispatcher( static_cast(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta); - auto e = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto e = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); @@ -507,3 +507,126 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } + +std::unique_ptr do_agg( + cudf::column_view key, + cudf::column_view val, + std::function()> make_agg) +{ + std::vector keys; + keys.push_back(key); + cudf::table_view const key_table(keys); + + cudf::groupby::groupby gb(key_table); + std::vector requests; + cudf::groupby::aggregation_request req; + req.values = val; + req.aggregations.push_back(make_agg()); + requests.push_back(std::move(req)); + + auto result = gb.aggregate(std::move(requests)); + + std::vector> result_columns; + for (auto&& c : result.first->release()) { + result_columns.push_back(std::move(c)); + } + + EXPECT_EQ(result.second.size(), 1); + EXPECT_EQ(result.second[0].results.size(), 1); + result_columns.push_back(std::move(result.second[0].results[0])); + + return std::make_unique(std::move(result_columns)); +} + +TEST_F(TDigestMergeTest, AllValuesAreNull) +{ + // The input must be sorted by the key. 
+ // See `aggregate_result_functor::operator()` for details. + auto const keys = cudf::test::fixed_width_column_wrapper{{0, 0, 1, 1, 2}}; + auto const keys_view = cudf::column_view(keys); + auto val_elems = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + // All values are null + return false; + }); + auto const vals = cudf::test::fixed_width_column_wrapper{ + val_elems, val_elems + keys_view.size(), val_valids}; + + auto const delta = 1000; + + // Compute tdigest. The result should have 3 empty clusters, one per group. + auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() { + return cudf::make_tdigest_aggregation(delta); + }); + + auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; + cudf::column_view const expected_computed_keys_view{expected_computed_keys}; + auto const expected_computed_vals = + cudf::tdigest::detail::make_empty_tdigests_column(expected_computed_keys_view.size(), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view()); + // The computed values are nullable even though the input values are not. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(), + compute_result->get_column(1).view()); + + // Merge tdigest. The result should have 3 empty clusters, one per group. 
+ auto const merge_result = + do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { + return cudf::make_merge_tdigest_aggregation(delta); + }); + + auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; + cudf::column_view const expected_merged_keys_view{expected_merged_keys}; + auto const expected_merged_vals = + cudf::tdigest::detail::make_empty_tdigests_column(expected_merged_keys_view.size(), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view()); +} + +TEST_F(TDigestMergeTest, AllValuesInOneGroupIsNull) +{ + cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 2, 3}; + cudf::test::fixed_width_column_wrapper vals{{10.0, 20.0, {}, {}, 30.0}, + {true, true, false, false, true}}; + + auto const delta = 1000; + + // Compute tdigest. The result should have 3 empty clusters, one per group. 
+ auto const compute_result = do_agg(cudf::column_view(keys), cudf::column_view(vals), [&delta]() { + return cudf::make_tdigest_aggregation(delta); + }); + + auto const expected_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2, 3}}; + + cudf::test::fixed_width_column_wrapper expected_means{10, 20, 30}; + cudf::test::fixed_width_column_wrapper expected_weights{1, 1, 1}; + cudf::test::fixed_width_column_wrapper expected_offsets{0, 1, 2, 2, 3}; + cudf::test::fixed_width_column_wrapper expected_mins{10.0, 20.0, 0.0, 30.0}; + cudf::test::fixed_width_column_wrapper expected_maxes{10.0, 20.0, 0.0, 30.0}; + auto const expected_values = + cudf::tdigest::detail::make_tdigest_column(4, + std::make_unique(expected_means), + std::make_unique(expected_weights), + std::make_unique(expected_offsets), + std::make_unique(expected_mins), + std::make_unique(expected_maxes), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys}, + compute_result->get_column(0).view()); + // The computed values are nullable even though the input values are not. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), compute_result->get_column(1).view()); + + // Merge tdigest. The result should have 3 empty clusters, one per group. 
+ auto const merge_result = + do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { + return cudf::make_merge_tdigest_aggregation(delta); + }); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys}, + merge_result->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), merge_result->get_column(1).view()); +} diff --git a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp index 4fb8f78b558..0e68050f935 100644 --- a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,8 +22,6 @@ #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - using NumericTypesNoBools = cudf::test::Concat; diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index cc95c7a2f0f..8bc47c92c6b 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -23,8 +23,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA256HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA256HashTest, EmptyTable) diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index 330f07ac8e2..ef4b9dd9b8a 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -225,8 +225,8 @@ TEST_F(DLPackUntypedTests, UnsupportedBroadcast1DTensorFromDlpack) constexpr int ndim = 1; // Broadcasted (stride-0) 1D tensor auto const data = cudf::test::make_type_param_vector({1}); - int64_t shape[ndim] = {5}; - int64_t strides[ndim] = {0}; + int64_t shape[ndim] = {5}; // 
NOLINT + int64_t strides[ndim] = {0}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -248,8 +248,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStrided1DTensorFromDlpack) constexpr int ndim = 1; // Strided 1D tensor auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2}; - int64_t strides[ndim] = {2}; + int64_t shape[ndim] = {2}; // NOLINT + int64_t strides[ndim] = {2}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -271,7 +271,7 @@ TEST_F(DLPackUntypedTests, UnsupportedImplicitRowMajor2DTensorFromDlpack) constexpr int ndim = 2; // Row major 2D tensor auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2, 2}; + int64_t shape[ndim] = {2, 2}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -293,8 +293,8 @@ TEST_F(DLPackUntypedTests, UnsupportedExplicitRowMajor2DTensorFromDlpack) constexpr int ndim = 2; // Row major 2D tensor with explicit strides auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2, 2}; - int64_t strides[ndim] = {2, 1}; + int64_t shape[ndim] = {2, 2}; // NOLINT + int64_t strides[ndim] = {2, 1}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -316,8 +316,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStridedColMajor2DTensorFromDlpack) constexpr int ndim = 2; // Column major, but strided in fastest dimension auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4, 5, 6, 7, 8}); - int64_t shape[ndim] = {2, 2}; - int64_t strides[ndim] = {2, 4}; + int64_t shape[ndim] = {2, 2}; // NOLINT + int64_t strides[ndim] = {2, 4}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -465,8 +465,8 @@ TYPED_TEST(DLPackNumericTests, FromDlpackCpu) using T = TypeParam; auto const data = cudf::test::make_type_param_vector({0, 1, 2, 3, 4, 0, 5, 6, 7, 8, 0}); uint64_t 
const offset{sizeof(T)}; - int64_t shape[2] = {4, 2}; - int64_t strides[2] = {1, 5}; + int64_t shape[2] = {4, 2}; // NOLINT + int64_t strides[2] = {1, 5}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index a4dc7531765..2151ec6e22f 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -270,9 +270,9 @@ TEST_F(FromArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -414,9 +414,9 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType) { std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); cudf::table_view expected_table_view = expected_table.view(); diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index cbfa4911c3c..ef9936b214c 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -309,9 +309,9 @@ 
TEST_F(FromArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 81c406c0faf..6e742b9e4cf 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -52,7 +52,7 @@ std::unique_ptr get_cudf_table() .release()); auto col4 = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( {true, false, true, false, true}, {true, false, true, true, false}) .release()); @@ -339,9 +339,9 @@ TEST_F(FromArrowTest, DictionaryIndicesType) std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 51216a8512c..7ba586461dc 100644 --- 
a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -55,7 +55,7 @@ get_nanoarrow_cudf_table(cudf::size_type length) auto col4 = cudf::test::fixed_width_column_wrapper( test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), test_data.bool_data.end(), test_data.bool_validity.begin()) @@ -82,8 +82,8 @@ get_nanoarrow_cudf_table(cudf::size_type length) test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back( @@ -575,9 +575,9 @@ TEST_F(ToArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp index fc0ed6c9352..fcb4433b42e 100644 --- a/cpp/tests/interop/to_arrow_host_test.cpp +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -436,9 +436,9 @@ 
TEST_F(ToArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 90ae12cdd90..a6aa4b22eca 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -90,7 +90,7 @@ std::pair, std::shared_ptr> get_table auto col4 = cudf::test::fixed_width_column_wrapper( int64_data.begin(), int64_data.end(), validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( bool_data.begin(), bool_data.end(), bool_validity.begin()) .release()); @@ -112,8 +112,8 @@ std::pair, std::shared_ptr> get_table cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( bool_data_validity.begin(), bool_data_validity.end())); columns.emplace_back( @@ -294,9 +294,9 @@ TEST_F(ToArrowTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto 
list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -438,7 +438,7 @@ TEST_F(ToArrowTest, FixedPoint64TableLarge) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - std::vector const metadata = {{"a"}}; + std::vector const metadata = {{"a"}}; // NOLINT ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 840cf263ed9..54262dc3b44 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -39,19 +39,19 @@ using cudf::device_span; */ template struct DecompressTest : public cudf::test::BaseFixture { - std::vector vector_from_string(char const* str) const + [[nodiscard]] std::vector vector_from_string(std::string const str) const { - return std::vector(reinterpret_cast(str), - reinterpret_cast(str) + strlen(str)); + return {reinterpret_cast(str.c_str()), + reinterpret_cast(str.c_str()) + strlen(str.c_str())}; } - void Decompress(std::vector* decompressed, + void Decompress(std::vector& decompressed, uint8_t const* compressed, size_t compressed_size) { auto stream = cudf::get_default_stream(); rmm::device_buffer src{compressed, compressed_size, stream}; - rmm::device_uvector dst{decompressed->size(), stream}; + rmm::device_uvector dst{decompressed.size(), stream}; cudf::detail::hostdevice_vector> inf_in(1, stream); inf_in[0] = {static_cast(src.data()), src.size()}; @@ -67,7 +67,7 @@ struct DecompressTest : public cudf::test::BaseFixture { static_cast(this)->dispatch(inf_in, inf_out, inf_stat); CUDF_CUDA_TRY(cudaMemcpyAsync( - decompressed->data(), dst.data(), dst.size(), 
cudaMemcpyDefault, stream.value())); + decompressed.data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); inf_stat.device_to_host_sync(stream); ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS); } @@ -125,49 +125,57 @@ struct NvcompConfigTest : public cudf::test::BaseFixture {}; TEST_F(GzipDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0x1f, 0x8b, 0x8, 0x0, 0x9, 0x63, 0x99, 0x5c, 0x2, 0xff, 0xcb, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0x1, 0x0, 0x85, 0x11, 0x4a, 0xd, 0xb, 0x0, 0x0, 0x0}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x28, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, ShortLiteralAfterLongCopyAtStartup) { - constexpr char uncompressed[] = "Aaaaaaaaaaaah!"; + std::string const uncompressed{"Aaaaaaaaaaaah!"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = {14, 0x0, 'A', 0x0, 'a', (10 - 4) * 4 + 1, 1, 0x4, 'h', '!'}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(BrotliDecompressTest, HelloWorld) { - 
constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x5, 0x80, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x3}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index dc14824d834..b265dcf9273 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -63,9 +63,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using column = cudf::column; using table = cudf::table; using table_view = cudf::table_view; @@ -954,7 +954,7 @@ TEST_F(CsvReaderTest, Strings) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, + std::vector{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}, view.column(1)); } @@ -1014,7 +1014,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, + std::vector{"\"abcdef ghi\"", R"("jkl ""mno"" pqr")", "stu \"vwx\" yz"}, view.column(1)); } @@ -1830,7 +1830,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) auto int_column = column_wrapper{10, 20, 30}; auto string_column = - column_wrapper{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}; + column_wrapper{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}; cudf::table_view input_table(std::vector{int_column, string_column}); // TODO add quoting 
style flag? @@ -2516,4 +2516,39 @@ TEST_F(CsvReaderTest, UTF8BOM) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expected); } +void expect_buffers_equal(cudf::io::datasource::buffer* lhs, cudf::io::datasource::buffer* rhs) +{ + ASSERT_EQ(lhs->size(), rhs->size()); + EXPECT_EQ(0, std::memcmp(lhs->data(), rhs->data(), lhs->size())); +} + +TEST_F(CsvReaderTest, OutOfMapBoundsReads) +{ + // write a lot of data into a file + auto filepath = temp_env->get_temp_dir() + "OutOfMapBoundsReads.csv"; + auto const num_rows = 1 << 20; + auto const row = std::string{"0,1,2,3,4,5,6,7,8,9\n"}; + auto const file_size = num_rows * row.size(); + { + std::ofstream outfile(filepath, std::ofstream::out); + for (size_t i = 0; i < num_rows; ++i) { + outfile << row; + } + } + + // Only memory map the middle of the file + auto source = cudf::io::datasource::create(filepath, file_size / 2, file_size / 4); + auto full_source = cudf::io::datasource::create(filepath); + auto const all_data = source->host_read(0, file_size); + auto ref_data = full_source->host_read(0, file_size); + expect_buffers_equal(ref_data.get(), all_data.get()); + + auto const start_data = source->host_read(file_size / 2, file_size / 2); + expect_buffers_equal(full_source->host_read(file_size / 2, file_size / 2).get(), + start_data.get()); + + auto const end_data = source->host_read(0, file_size / 2 + 512); + expect_buffers_equal(full_source->host_read(0, file_size / 2 + 512).get(), end_data.get()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 68ec255b39d..cb6716f4a18 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -68,9 +68,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; 
cudf::test::TempDirTestEnvironment* const temp_env = static_cast( @@ -858,8 +858,7 @@ TEST_P(JsonReaderRecordTest, JsonLinesObjects) TEST_P(JsonReaderRecordTest, JsonLinesObjectsStrings) { - auto const test_opt = GetParam(); - auto test_json_objects = [test_opt](std::string const& data) { + auto test_json_objects = [](std::string const& data) { cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) .lines(true); @@ -2575,6 +2574,30 @@ TEST_F(JsonReaderTest, ViableDelimiter) EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument); } +TEST_F(JsonReaderTest, ViableDelimiterNewlineWS) +{ + // Test input + std::string input = R"({"a": + 100})"; + + cudf::io::json_reader_options json_parser_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()}) + .lines(true) + .delimiter('\0'); + + auto result = cudf::io::read_json(json_parser_options); + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 1); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + + auto col1_iterator = thrust::constant_iterator(100); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper(col1_iterator, col1_iterator + 1)); +} + // Test case for dtype prune: // all paths, only one. 
// one present, another not present, nothing present diff --git a/cpp/tests/io/json/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp index 2c4e29a01b9..39d31c406a5 100644 --- a/cpp/tests/io/json/json_writer.cpp +++ b/cpp/tests/io/json/json_writer.cpp @@ -70,6 +70,43 @@ TEST_F(JsonWriterTest, EmptyInput) EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } +TEST_F(JsonWriterTest, EmptyLeaf) +{ + cudf::test::strings_column_wrapper col1{""}; + cudf::test::fixed_width_column_wrapper offsets{0, 0}; + auto col2 = make_lists_column(1, + offsets.release(), + cudf::test::strings_column_wrapper{}.release(), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); + auto col3 = cudf::test::lists_column_wrapper::make_one_empty_row_column(); + cudf::table_view tbl_view{{col1, *col2, col3}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + + // Empty columns in table + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected = R"([{"col1":"","col2":[],"col3":[]}])"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + + // Empty columns in table - JSON Lines + out_buffer.clear(); + out_options.enable_lines(true); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected_lines = R"({"col1":"","col2":[],"col3":[]})" + "\n"; + EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); +} + TEST_F(JsonWriterTest, ErrorCases) { cudf::test::strings_column_wrapper col1{"a", "b", "c"}; diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index 327169ae563..f32aba0e632 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ 
b/cpp/tests/io/json/nested_json_test.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1196,4 +1197,181 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter) } } +TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) +{ + // Test input. Inline comments used to indicate character indexes + // 012345678 <= line 0 + char const delimiter = GetParam(); + + /* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as + * a delimiter for JSONL) + * {"a":2} + * {"a":{"a":{"a":[321{"a":[1]} + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + std::string input = R"({"a":2})" + "\n"; + // starting position 8 (zero indexed) + input += R"({"a":)" + std::string(1, delimiter); + // starting position 14 (zero indexed) + input += R"({"a":{"a":[321)" + std::string(1, delimiter); + // starting position 29 (zero indexed) + input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter); + // starting position 41 (zero indexed) + input += R"({"b":123})" + "\n"; + // starting position 51 (zero indexed) + input += R"({"b":123})" + std::string(1, delimiter); + // starting position 61 (zero indexed) + input += R"({"b")" + std::string("\n:\n\n\n123\n}"); + + // Golden token stream sample + using token_t = cuio_json::token_t; + std::vector> golden_token_stream; + if (delimiter != '\n') { + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + 
{36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 3 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 4 (valid) + {61, token_t::StructBegin}, + {62, token_t::StructMemberBegin}, + {62, token_t::FieldNameBegin}, + {64, token_t::FieldNameEnd}, + {70, token_t::ValueBegin}, + {73, token_t::ValueEnd}, + {74, token_t::StructMemberEnd}, + {74, token_t::StructEnd}}; + } else { + /* Input: + * {"a":2} + * {"a": + * {"a":{"a":[321 + * {"a":[1]} + * + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 3 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + {36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 4 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 5 (valid) + {51, token_t::StructBegin}, + {52, token_t::StructMemberBegin}, + {52, token_t::FieldNameBegin}, + {54, token_t::FieldNameEnd}, + {56, 
token_t::ValueBegin}, + {59, token_t::ValueEnd}, + {59, token_t::StructMemberEnd}, + {59, token_t::StructEnd}, + // Line 6 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}}; + } + + auto const stream = cudf::get_default_stream(); + + // Prepare input & output buffers + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = cudf::device_span{ + d_scalar.data(), static_cast(d_scalar.size())}; + + // Default parsing options + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{}) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .delimiter(delimiter) + .lines(true); + + // Parse the JSON and get the token stream + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, in_opts, stream, cudf::get_current_device_resource_ref()); + // Copy back the number of tokens that were written + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); + + stream.synchronize(); + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); + + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; + // Ensure the token category is correct + EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i; + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 39ba62952b4..cce0adbf317 100644 --- a/cpp/tests/io/orc_test.cpp +++ 
b/cpp/tests/io/orc_test.cpp @@ -38,13 +38,14 @@ #include +#include #include template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using str_col = column_wrapper; using bool_col = column_wrapper; @@ -767,14 +768,14 @@ TEST_F(OrcChunkedWriterTest, Metadata) TEST_F(OrcChunkedWriterTest, Strings) { - bool mask1[] = {true, true, false, true, true, true, true}; + std::array mask1{true, true, false, true, true, true, true}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; - str_col strings1(h_strings1.begin(), h_strings1.end(), mask1); + str_col strings1(h_strings1.begin(), h_strings1.end(), mask1.data()); table_view tbl1({strings1}); - bool mask2[] = {false, true, true, true, true, true, true}; + std::array mask2{false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; - str_col strings2(h_strings2.begin(), h_strings2.end(), mask2); + str_col strings2(h_strings2.begin(), h_strings2.end(), mask2.data()); table_view tbl2({strings2}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -877,26 +878,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) using T = TypeParam; - int num_els = 31; + constexpr int num_els{31}; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true}; + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, 
static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); table_view tbl1({c1a_w, c1b_w}); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -920,26 +921,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) using T = TypeParam; - int num_els = 33; + constexpr int num_els = 33; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true}; + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper 
c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); table_view tbl1({c1a_w, c1b_w}); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -1140,7 +1141,7 @@ TEST_F(OrcReaderTest, zstdCompressionRegression) } // Test with zstd compressed orc file with high compression ratio. - constexpr uint8_t input_buffer[] = { + constexpr std::array input_buffer{ 0x4f, 0x52, 0x43, 0x5a, 0x00, 0x00, 0x28, 0xb5, 0x2f, 0xfd, 0xa4, 0x34, 0xc7, 0x03, 0x00, 0x74, 0x00, 0x00, 0x18, 0x41, 0xff, 0xaa, 0x02, 0x00, 0xbb, 0xff, 0x45, 0xc8, 0x01, 0x25, 0x30, 0x04, 0x65, 0x00, 0x00, 0x10, 0xaa, 0x1f, 0x02, 0x00, 0x01, 0x29, 0x0b, 0xc7, 0x39, 0xb8, 0x02, 0xcb, @@ -1154,7 +1155,7 @@ TEST_F(OrcReaderTest, zstdCompressionRegression) 0x30, 0x09, 0x82, 0xf4, 0x03, 0x03, 0x4f, 0x52, 0x43, 0x17}; auto source = - cudf::io::source_info(reinterpret_cast(input_buffer), sizeof(input_buffer)); + cudf::io::source_info(reinterpret_cast(input_buffer.data()), input_buffer.size()); cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder(source).use_index(false); @@ -1357,21 +1358,22 @@ TEST_P(OrcWriterTestStripes, StripeSize) cols.push_back(col.release()); auto const expected = std::make_unique
(std::move(cols)); - auto validate = [&](std::vector const& orc_buffer) { - auto const expected_stripe_num = - std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); - EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) - .use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - }; + auto validate = + [&, &size_bytes = size_bytes, &size_rows = size_rows](std::vector const& orc_buffer) { + auto const expected_stripe_num = + std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); + EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) + .use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + }; { std::vector out_buffer_chunked; diff --git a/cpp/tests/io/parquet_chunked_writer_test.cpp b/cpp/tests/io/parquet_chunked_writer_test.cpp index 282c6f3adad..810fee89c48 100644 --- a/cpp/tests/io/parquet_chunked_writer_test.cpp +++ b/cpp/tests/io/parquet_chunked_writer_test.cpp @@ -124,15 +124,15 @@ TEST_F(ParquetChunkedWriterTest, Strings) { std::vector> cols; - bool mask1[] = {true, true, false, true, true, true, true}; + std::array mask1{true, true, false, true, true, true, true}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; - cudf::test::strings_column_wrapper strings1(h_strings1.begin(), 
h_strings1.end(), mask1); + cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1.data()); cols.push_back(strings1.release()); cudf::table tbl1(std::move(cols)); - bool mask2[] = {false, true, true, true, true, true, true}; + std::array mask2{false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; - cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); + cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2.data()); cols.push_back(strings2.release()); cudf::table tbl2(std::move(cols)); @@ -771,29 +771,29 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) using T = TypeParam; - int num_els = 31; + constexpr int num_els = 31; std::vector> cols; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + true, true, true, true, true, true, true, true, true}; + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); cols.push_back(c1a_w.release()); cols.push_back(c1b_w.release()); cudf::table tbl1(std::move(cols)); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - 
column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); cols.push_back(c2a_w.release()); cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); @@ -819,29 +819,29 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2) using T = TypeParam; - int num_els = 33; + constexpr int num_els = 33; std::vector> cols; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true}; + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); cols.push_back(c1a_w.release()); cols.push_back(c1b_w.release()); cudf::table tbl1(std::move(cols)); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + 
std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); cols.push_back(c2a_w.release()); cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index 3dd5ad145ea..6141a40bc95 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -483,10 +483,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> ascending() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", i); - return std::string(buf); + sprintf(buf.data(), "%09d", i); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -495,10 +495,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> descending() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", num_ordered_rows - i); - return std::string(buf); + sprintf(buf.data(), "%09d", static_cast(num_ordered_rows - i)); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -507,10 +507,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> unordered() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i)); - return std::string(buf); + sprintf(buf.data(), "%09d", (i % 2 == 0) ? 
i : (num_ordered_rows - i)); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index bc6145d77da..bd1579eaa1b 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -35,9 +35,9 @@ template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using column = cudf::column; using table = cudf::table; using table_view = cudf::table_view; diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 01027d04658..f1286a00d22 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -23,6 +23,8 @@ #include #include +#include + //////////////////////////////// // delta encoding writer tests @@ -96,7 +98,7 @@ TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) // list constexpr int vals_per_row = 4; auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( - 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + 0, [](cudf::size_type idx) { return idx * vals_per_row; }); cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, c1_offset_iter + num_rows + 1); cudf::test::fixed_width_column_wrapper c1_vals( @@ -225,10 +227,9 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { - cudf::io::parquet::detail::BoundaryOrder::ASCENDING, - cudf::io::parquet::detail::BoundaryOrder::DESCENDING, - cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; + std::array 
expected_orders{cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 6c61535359f..4a5309f3ba7 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -29,6 +29,8 @@ #include #include +#include + TEST_F(ParquetReaderTest, UserBounds) { // trying to read more rows than there are should result in @@ -569,7 +571,8 @@ TEST_F(ParquetReaderTest, DecimalRead) This test is a temporary test until python gains the ability to write decimal, so we're embedding a parquet file directly into the code here to prevent issues with finding the file */ - unsigned char const decimals_parquet[] = { + constexpr unsigned int decimals_parquet_len = 2366; + std::array const decimals_parquet{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xb0, 0x03, 0x15, 0xb8, 0x03, 0x2c, 0x15, 0x6a, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, 0x02, 0x28, 0x04, 0x7f, 0x96, 0x98, 0x00, 0x18, 0x04, 0x81, 0x69, 0x67, 0xff, 0x00, 0x00, 0x00, 0xd8, 0x01, 0xf0, 0xd7, 0x04, 0x00, @@ -728,10 +731,10 @@ TEST_F(ParquetReaderTest, DecimalRead) 0x30, 0x36, 0x30, 0x36, 0x39, 0x65, 0x35, 0x30, 0x63, 0x39, 0x62, 0x37, 0x39, 0x37, 0x30, 0x62, 0x65, 0x62, 0x64, 0x31, 0x29, 0x19, 0x3c, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xd3, 0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - unsigned int decimals_parquet_len = 2366; - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(decimals_parquet), decimals_parquet_len}); + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{ + reinterpret_cast(decimals_parquet.data()), 
decimals_parquet_len}); auto result = cudf::io::read_parquet(read_opts); auto validity = @@ -739,7 +742,7 @@ TEST_F(ParquetReaderTest, DecimalRead) EXPECT_EQ(result.tbl->view().num_columns(), 3); - int32_t col0_data[] = { + std::array col0_data{ -2354584, -190275, 8393572, 6446515, -5687920, -1843550, -6897687, -6780385, 3428529, 5842056, -4312278, -4450603, -7516141, 2974667, -4288640, 1065090, -9410428, 7891355, 1076244, -1975984, 6999466, 2666959, 9262967, 7931374, -1370640, 451074, 8799111, @@ -753,29 +756,28 @@ TEST_F(ParquetReaderTest, DecimalRead) std::begin(col0_data), std::end(col0_data), validity, numeric::scale_type{-4}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - int64_t col1_data[] = {29274040266581, -17210335917753, -58420730139037, - 68073792696254, 2236456014294, 13704555677045, - -70797090469548, -52248605513407, -68976081919961, - -34277313883112, 97774730521689, 21184241014572, - -670882460254, -40862944054399, -24079852370612, - -88670167797498, -84007574359403, -71843004533519, - -55538016554201, 3491435293032, -29085437167297, - 36901882672273, -98622066122568, -13974902998457, - 86712597643378, -16835133643735, -94759096142232, - 30708340810940, 79086853262082, 78923696440892, - -76316597208589, 37247268714759, 80303592631774, - 57790350050889, 19387319851064, -33186875066145, - 69701203023404, -7157433049060, -7073790423437, - 92769171617714, -75127120182184, -951893180618, - 64927618310150, -53875897154023, -16168039035569, - -24273449166429, -30359781249192, 35639397345991, - 45844829680593, 71401416837149, 0, - -99999999999999, 99999999999999}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); + std::array col1_data{29274040266581, -17210335917753, -58420730139037, + 68073792696254, 2236456014294, 13704555677045, + -70797090469548, -52248605513407, -68976081919961, + -34277313883112, 97774730521689, 21184241014572, + -670882460254, -40862944054399, 
-24079852370612, + -88670167797498, -84007574359403, -71843004533519, + -55538016554201, 3491435293032, -29085437167297, + 36901882672273, -98622066122568, -13974902998457, + 86712597643378, -16835133643735, -94759096142232, + 30708340810940, 79086853262082, 78923696440892, + -76316597208589, 37247268714759, 80303592631774, + 57790350050889, 19387319851064, -33186875066145, + 69701203023404, -7157433049060, -7073790423437, + 92769171617714, -75127120182184, -951893180618, + 64927618310150, -53875897154023, -16168039035569, + -24273449166429, -30359781249192, 35639397345991, + 45844829680593, 71401416837149, 0, + -99999999999999, 99999999999999}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), col1_data.size()); cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity, numeric::scale_type{-5}); + col1_data.begin(), col1_data.end(), validity, numeric::scale_type{-5}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); cudf::io::parquet_reader_options read_strict_opts = read_opts; @@ -786,7 +788,7 @@ TEST_F(ParquetReaderTest, DecimalRead) // dec7p3: Decimal(precision=7, scale=3) backed by FIXED_LENGTH_BYTE_ARRAY(length = 4) // dec12p11: Decimal(precision=12, scale=11) backed by FIXED_LENGTH_BYTE_ARRAY(length = 6) // dec20p1: Decimal(precision=20, scale=1) backed by FIXED_LENGTH_BYTE_ARRAY(length = 9) - unsigned char const fixed_len_bytes_decimal_parquet[] = { + std::array const fixed_len_bytes_decimal_parquet{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xA8, 0x01, 0x15, 0xAE, 0x01, 0x2C, 0x15, 0x28, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x02, 0x28, 0x04, 0x00, 0x97, 0x45, 0x72, 0x18, 0x04, 0x00, 0x01, 0x81, 0x3B, 0x00, 0x00, 0x00, 0x54, 0xF0, 0x53, 0x04, 0x00, 0x00, @@ -875,75 +877,72 @@ TEST_F(ParquetReaderTest, DecimalRead) cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{ - 
reinterpret_cast(fixed_len_bytes_decimal_parquet), parquet_len}); + reinterpret_cast(fixed_len_bytes_decimal_parquet.data()), parquet_len}); auto result = cudf::io::read_parquet(read_opts); EXPECT_EQ(result.tbl->view().num_columns(), 3); - auto validity_c0 = cudf::test::iterators::nulls_at({19}); - int32_t col0_data[] = {6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, - 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, - 2282579, 7521455, 4430706, 1937859, 4532040, 0}; + auto validity_c0 = cudf::test::iterators::nulls_at({19}); + std::array col0_data{6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, + 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, + 2282579, 7521455, 4430706, 1937859, 4532040, 0}; - EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), - sizeof(col0_data) / sizeof(col0_data[0])); + EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), col0_data.size()); cudf::test::fixed_point_column_wrapper col0( - std::begin(col0_data), std::end(col0_data), validity_c0, numeric::scale_type{-3}); + col0_data.begin(), col0_data.end(), validity_c0, numeric::scale_type{-3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - auto validity_c1 = cudf::test::iterators::nulls_at({18}); - int64_t col1_data[] = {361378026250, - 30646804862, - 429930238629, - 418758703536, - 895494171113, - 435283865083, - 809096053722, - -999999999999, - 426465099333, - 526684574144, - 826310892810, - 584686967589, - 113822282951, - 409236212092, - 420631167535, - 918438386086, - -999999999999, - 489053889147, - 0, - 363993164092}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); + auto validity_c1 = cudf::test::iterators::nulls_at({18}); + std::array col1_data{361378026250, + 30646804862, + 429930238629, + 418758703536, + 895494171113, + 435283865083, + 809096053722, + -999999999999, + 426465099333, + 526684574144, + 826310892810, + 584686967589, + 
113822282951, + 409236212092, + 420631167535, + 918438386086, + -999999999999, + 489053889147, + 0, + 363993164092}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), col1_data.size()); cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity_c1, numeric::scale_type{-11}); + col1_data.begin(), col1_data.end(), validity_c1, numeric::scale_type{-11}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); - auto validity_c2 = cudf::test::iterators::nulls_at({6, 14}); - __int128_t col2_data[] = {9078697037144433659, - 9050770539577117612, - 2358363961733893636, - 1566059559232276662, - 6658306200002735268, - 4967909073046397334, - 0, - 7235588493887532473, - 5023160741463849572, - 2765173712965988273, - 3880866513515749646, - 5019704400576359500, - 5544435986818825655, - 7265381725809874549, - 0, - 1576192427381240677, - 2828305195087094598, - 260308667809395171, - 2460080200895288476, - 2718441925197820439}; - - EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), - sizeof(col2_data) / sizeof(col2_data[0])); + auto validity_c2 = cudf::test::iterators::nulls_at({6, 14}); + std::array<__int128_t, 20> col2_data{9078697037144433659, + 9050770539577117612, + 2358363961733893636, + 1566059559232276662, + 6658306200002735268, + 4967909073046397334, + 0, + 7235588493887532473, + 5023160741463849572, + 2765173712965988273, + 3880866513515749646, + 5019704400576359500, + 5544435986818825655, + 7265381725809874549, + 0, + 1576192427381240677, + 2828305195087094598, + 260308667809395171, + 2460080200895288476, + 2718441925197820439}; + + EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), col2_data.size()); cudf::test::fixed_point_column_wrapper<__int128_t> col2( - std::begin(col2_data), std::end(col2_data), validity_c2, numeric::scale_type{-1}); + col2_data.begin(), col2_data.end(), validity_c2, numeric::scale_type{-1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), col2); } } @@ 
-1190,15 +1189,12 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) cudf::test::fixed_width_column_wrapper values(value_iter, value_iter + num_values, validity); // ~256k values with num_nesting_levels = 16 - int total_values_produced = num_values; - auto prev_col = values.release(); + auto prev_col = values.release(); for (int idx = 0; idx < num_nesting_levels; idx++) { - auto const depth = num_nesting_levels - idx; auto const num_rows = (1 << (num_nesting_levels - idx)); auto offsets_iter = cudf::detail::make_counting_transform_iterator( - 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); - total_values_produced += (num_rows + 1); + 0, [](cudf::size_type i) { return i * rows_per_level; }); cudf::test::fixed_width_column_wrapper offsets(offsets_iter, offsets_iter + num_rows + 1); @@ -1221,7 +1217,7 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) TEST_F(ParquetReaderTest, SingleLevelLists) { - unsigned char list_bytes[] = { + std::array list_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, @@ -1239,7 +1235,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists) // read single level list reproducing parquet file cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)}); + cudf::io::source_info{reinterpret_cast(list_bytes.data()), list_bytes.size()}); auto table = cudf::io::read_parquet(read_opts); auto const c0 = table.tbl->get_column(0); @@ -1252,7 +1248,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists) TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) { - unsigned char list_bytes[] = { + std::array list_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 
0x91, 0x8c, 0x06, 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, @@ -1271,7 +1267,7 @@ TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) auto reader = cudf::io::chunked_parquet_reader( 1L << 31, cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)})); + cudf::io::source_info{reinterpret_cast(list_bytes.data()), list_bytes.size()})); int iterations = 0; while (reader.has_next() && iterations < 10) { auto chunk = reader.read_chunk(); @@ -1932,7 +1928,7 @@ TEST_F(ParquetReaderTest, FilterFloatNAN) TEST_F(ParquetReaderTest, RepeatedNoAnnotations) { - constexpr unsigned char repeated_bytes[] = { + constexpr std::array repeated_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, @@ -1976,9 +1972,9 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations) 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - auto read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); - auto result = cudf::io::read_parquet(read_opts); + auto read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{ + reinterpret_cast(repeated_bytes.data()), repeated_bytes.size()}); + auto result = cudf::io::read_parquet(read_opts); EXPECT_EQ(result.tbl->view().column(0).size(), 6); EXPECT_EQ(result.tbl->view().num_columns(), 2); diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 9e66fc9409f..a0b48f54854 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ 
b/cpp/tests/io/parquet_v2_test.cpp @@ -23,6 +23,8 @@ #include +#include + using cudf::test::iterators::no_nulls; // Base test fixture for V2 header tests @@ -693,9 +695,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -715,9 +717,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); @@ -787,9 +789,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -819,9 +821,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows, valids); @@ -897,9 +899,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) // fixed length strings auto str1_elements = 
cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -914,9 +916,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col3 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); @@ -1034,7 +1036,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) // hard coded schema indices. // TODO find a way to do this without magic - size_t const colidxs[] = {1, 3, 4, 5, 8}; + constexpr std::array colidxs{1, 3, 4, 5, 8}; for (size_t r = 0; r < fmd.row_groups.size(); r++) { auto const& rg = fmd.row_groups[r]; for (size_t c = 0; c < rg.columns.size(); c++) { @@ -1129,7 +1131,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) // col1 will have num_ordered_rows / 2 nulls total // col2 will have num_ordered_rows / 3 nulls total // col3 will have num_ordered_rows / 4 nulls total - int const null_mods[] = {0, 2, 3, 4}; + constexpr std::array null_mods{0, 2, 3, 4}; for (auto const& rg : fmd.row_groups) { for (size_t c = 0; c < rg.columns.size(); c++) { @@ -1299,25 +1301,25 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); - int64_t const expected_null_counts[] = {4, 4, 4, 6, 4, 6, 4, 5, 11}; - std::vector const expected_def_hists[] = {{1, 1, 2, 3}, - {1, 3, 10}, - {1, 1, 2, 10}, - {1, 1, 2, 2, 8}, - {1, 1, 1, 1, 10}, - {1, 1, 1, 1, 2, 8}, - {1, 3, 9}, - {1, 3, 1, 8}, - {1, 0, 4, 1, 1, 4, 9}}; - std::vector const expected_rep_hists[] = {{4, 
3}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 5}, - {4, 4, 5}, - {4, 6, 2, 8}}; + std::array expected_null_counts{4, 4, 4, 6, 4, 6, 4, 5, 11}; + std::vector> const expected_def_hists = {{1, 1, 2, 3}, + {1, 3, 10}, + {1, 1, 2, 10}, + {1, 1, 2, 2, 8}, + {1, 1, 1, 1, 10}, + {1, 1, 1, 1, 2, 8}, + {1, 3, 9}, + {1, 3, 1, 8}, + {1, 0, 4, 1, 1, 4, 9}}; + std::vector> const expected_rep_hists = {{4, 3}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 5}, + {4, 4, 5}, + {4, 6, 2, 8}}; auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index c8100038942..6c5e9cdf07a 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -31,6 +31,7 @@ #include #include +#include #include using cudf::test::iterators::no_nulls; @@ -289,7 +290,8 @@ class custom_test_data_sink : public cudf::io::data_sink { CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); } - ~custom_test_data_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~custom_test_data_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -879,53 +881,52 @@ TEST_F(ParquetWriterTest, Decimal128Stats) TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) { - char const* coldata[] = { - // in-range 7 bit. should truncate to "yyyyyyyz" - "yyyyyyyyy", - // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's - // considered binary, not UTF-8. If UTF-8 it should not truncate. - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - // max binary. this should not truncate - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - // in-range 2-byte UTF8 (U+00E9). 
should truncate to "éééê" - "ééééé", - // max 2-byte UTF8 (U+07FF). should not truncate - "߿߿߿߿߿", - // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ" - "ࠀࠀࠀ", - // max 3-byte UTF8 (U+FFFF). should not truncate - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" - "𐀀𐀀𐀀", - // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, - // which is no longer valid unicode, but is still ok UTF-8??? - "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - // max 4-byte UTF8 (U+1FFFFF). should not truncate - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + std::array coldata{// in-range 7 bit. should truncate to "yyyyyyyz" + "yyyyyyyyy", + // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's + // considered binary, not UTF-8. If UTF-8 it should not truncate. + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + // max binary. this should not truncate + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê" + "ééééé", + // max 2-byte UTF8 (U+07FF). should not truncate + "߿߿߿߿߿", + // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ" + "ࠀࠀࠀ", + // max 3-byte UTF8 (U+FFFF). should not truncate + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" + "𐀀𐀀𐀀", + // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, + // which is no longer valid unicode, but is still ok UTF-8??? + "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + // max 4-byte UTF8 (U+1FFFFF). should not truncate + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; // NOTE: UTF8 min is initialized with 0xf7bfbfbf. Binary values larger // than that will not become minimum value (when written as UTF-8). 
- char const* truncated_min[] = {"yyyyyyyy", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - "\xf7\xbf\xbf\xbf", - "éééé", - "߿߿߿߿", - "ࠀࠀ", - "\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀀", - "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - "\xf7\xbf\xbf\xbf"}; - - char const* truncated_max[] = {"yyyyyyyz", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - "éééê", - "߿߿߿߿߿", - "ࠀࠁ", - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀁", - "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + std::array truncated_min{"yyyyyyyy", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + "\xf7\xbf\xbf\xbf", + "éééé", + "߿߿߿߿", + "ࠀࠀ", + "\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀀", + "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + "\xf7\xbf\xbf\xbf"}; + + std::array truncated_max{"yyyyyyyz", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + "éééê", + "߿߿߿߿߿", + "ࠀࠁ", + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀁", + "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; auto cols = [&]() { using string_wrapper = column_wrapper; @@ -981,13 +982,15 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) { - std::vector truncated_min[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, - {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_min{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, + {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; - std::vector truncated_max[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, - {0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_max{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, + {0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; cudf::test::lists_column_wrapper col0{ {0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 
0xfe, 0xfe, 0xfe, 0xfe}}; diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 408d54bd5ff..74d08061df9 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -145,7 +145,7 @@ TEST_F(MultibyteSplitTest, LargeInput) for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { host_input += "...:|"; - host_expected.emplace_back(std::string("...:|")); + host_expected.emplace_back("...:|"); } auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()}; diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 93754091b3f..178edc52dd3 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -314,7 +314,7 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -362,7 +362,7 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -398,7 +398,7 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), 
std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -423,7 +423,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true}); @@ -468,7 +468,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; auto col0_gold_names_col = strcol_wrapper{ "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"}; diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index a9186874e83..42a574ac5c0 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -652,7 +652,7 @@ TEST_F(JsonPathTests, MixedOutput) // various queries on: // clang-format off std::vector input_strings { - "{\"a\": {\"b\" : \"c\"}}", + R"({"a": {"b" : "c"}})", "{" "\"a\": {\"b\" : \"c\"}," @@ -827,7 +827,7 @@ TEST_F(JsonPathTests, AllowSingleQuotes) // various queries on: std::vector input_strings{ // clang-format off - "{\'a\': {\'b\' : \'c\'}}", + R"({'a': {'b' : 'c'}})", "{" "\'a\': {\'b\' : \"c\"}," @@ -902,7 +902,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::vector input_strings{ // 
clang-format off - "{\"item\" : [{\"key\" : \"value[\"}]}", + R"({"item" : [{"key" : "value["}]})", // clang-format on }; @@ -927,7 +927,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::vector input_strings{ // clang-format off - "{\"a\" : \"[}{}][][{[\\\"}}[\\\"]\"}", + R"({"a" : "[}{}][][{[\"}}[\"]"})", // clang-format on }; @@ -958,8 +958,8 @@ TEST_F(JsonPathTests, EscapeSequences) std::vector input_strings{ // clang-format off - "{\"a\" : \"\\\" \\\\ \\/ \\b \\f \\n \\r \\t\"}", - "{\"a\" : \"\\u1248 \\uacdf \\uACDF \\u10EF\"}" + R"({"a" : "\" \\ \/ \b \f \n \r \t"})", + R"({"a" : "\u1248 \uacdf \uACDF \u10EF"})" // clang-format on }; diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu index 80bde168b75..a212d7d654a 100644 --- a/cpp/tests/large_strings/json_tests.cu +++ b/cpp/tests/large_strings/json_tests.cu @@ -96,5 +96,5 @@ TEST_F(JsonLargeReaderTest, MultiBatch) } // go back to normal batch_size - unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); + unsetenv("LIBCUDF_JSON_BATCH_SIZE"); } diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index 97979e79010..bea044496b3 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -97,7 +97,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyColumns) "hi", "hj"}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -296,7 +296,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns) true, false, false}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else diff --git a/cpp/tests/merge/merge_test.cpp 
b/cpp/tests/merge/merge_test.cpp index 2e09f25b51f..6208d395f0a 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -349,7 +349,7 @@ TYPED_TEST(MergeTest_, Merge1KeyColumns) cudf::test::fixed_width_column_wrapper expectedDataWrap1(seq_out1, seq_out1 + outputRows); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -452,7 +452,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) cudf::size_type inputRows = 40; // data: 0 2 4 6 | valid: 1 1 1 0 - auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 0; // <- no shortcut to this can avoid compiler errors } else { @@ -465,7 +465,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) leftColWrap1(sequence1, sequence1 + inputRows, valid_sequence1); // data: 1 3 5 7 | valid: 1 1 1 0 - auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 1; } else diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 915717713df..37414eb3fba 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {}; TEST_F(PercentileApproxTest, EmptyInput) { - auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), 
cudf::get_current_device_resource_ref()); cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; std::vector input; diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 949ffcc26a6..bdb98372836 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -33,9 +33,12 @@ #include #include +#include #include +#include #include +#include #include using aggregation = cudf::aggregation; @@ -766,6 +769,25 @@ TYPED_TEST(MultiStepReductionTest, Mean) expected_value_nulls); } +template +double calc_var(std::vector const& v, int ddof, std::vector const& mask = {}) +{ + auto const values = [&]() { + if (mask.empty()) { return v; } + std::vector masked{}; + thrust::copy_if( + v.begin(), v.end(), mask.begin(), std::back_inserter(masked), [](auto m) { return m; }); + return masked; + }(); + auto const valid_count = values.size(); + double const mean = std::accumulate(values.cbegin(), values.cend(), double{0}) / valid_count; + double const sq_sum_of_differences = + std::accumulate(values.cbegin(), values.cend(), double{0}, [mean](double acc, auto const v) { + return acc + std::pow(v - mean, 2); + }); + return sq_sum_of_differences / (valid_count - ddof); +} + // This test is disabled for only a Debug build because a compiler error // documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu #ifdef NDEBUG @@ -778,25 +800,12 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [](std::vector& v, cudf::size_type valid_count, int ddof) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq 
/ div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls std::vector v = convert_values(int_values); cudf::test::fixed_width_column_wrapper col(v.begin(), v.end()); auto const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -812,23 +821,19 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(v, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(v, host_bools, T{0}); - - double var_nulls = calc_var(replaced_array, valid_count, ddof); - double std_nulls = std::sqrt(var_nulls); + double var_nulls = calc_var(v, ddof, host_bools); + double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } // ---------------------------------------------------------------------------- @@ -1140,23 +1145,10 @@ TEST_P(ReductionParamTest, DISABLED_std_var) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [ddof](std::vector& v, cudf::size_type valid_count) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double 
sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, double i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::fixed_width_column_wrapper col(int_values.begin(), int_values.end()); - double var = calc_var(int_values, int_values.size()); + double var = calc_var(int_values, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -1173,23 +1165,19 @@ TEST_P(ReductionParamTest, DISABLED_std_var) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(int_values, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(int_values, host_bools, int{0}); - - double var_nulls = calc_var(replaced_array, valid_count); + double var_nulls = calc_var(int_values, ddof, host_bools); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } //------------------------------------------------------------------- @@ -1254,7 +1242,7 @@ struct StringReductionTest : public cudf::test::BaseFixture, }; // ------------------------------------------------------------------------ -std::vector string_list[] = { +std::vector> string_list{{ {"one", "two", "three", "four", 
"five", "six", "seven", "eight", "nine"}, {"", "two", "three", "four", "five", "six", "seven", "eight", "nine"}, {"one", "", "three", "four", "five", "six", "seven", "eight", "nine"}, @@ -1264,7 +1252,7 @@ std::vector string_list[] = { {"\xF7\xBF\xBF\xBF", "", "", "", "", "", "", "", ""}, {"one", "two", "three", "four", "\xF7\xBF\xBF\xBF", "six", "seven", "eight", "nine"}, {"one", "two", "\xF7\xBF\xBF\xBF", "four", "five", "six", "seven", "eight", "nine"}, -}; +}}; INSTANTIATE_TEST_CASE_P(string_cases, StringReductionTest, testing::ValuesIn(string_list)); TEST_P(StringReductionTest, MinMax) { @@ -2235,7 +2223,7 @@ TYPED_TEST(ReductionTest, NthElement) struct DictionaryStringReductionTest : public StringReductionTest {}; -std::vector data_list[] = { +std::vector> data_list = { {"nine", "two", "five", "three", "five", "six", "two", "eight", "nine"}, }; INSTANTIATE_TEST_CASE_P(dictionary_cases, @@ -2472,21 +2460,11 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector v = convert_values(int_values); cudf::data_type output_type{cudf::type_to_id()}; - auto calc_var = [](std::vector const& v, cudf::size_type valid_count, cudf::size_type ddof) { - double mean = std::accumulate(v.cbegin(), v.cend(), double{0}); - mean /= valid_count; - double sum_of_sq = std::accumulate( - v.cbegin(), v.cend(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - auto const div = valid_count - ddof; - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::dictionary_column_wrapper col(v.begin(), v.end()); cudf::size_type const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -2498,15 +2476,13 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector validity({true, true, false, true, true, true, false, true}); 
cudf::test::dictionary_column_wrapper col_nulls(v.begin(), v.end(), validity.begin()); - cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true); - - double var_nulls = calc_var(replace_nulls(v, validity, T{0}), valid_count, ddof); + double var_nulls = calc_var(v, ddof, validity); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, - var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, - std_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, + var_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, + std_nulls); } TYPED_TEST(DictionaryReductionTest, NthElement) diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index 76dbbaef491..c4463d68a68 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -415,8 +415,8 @@ TEST_F(ScanStringsTest, MoreStringsMinMax) int row_count = 512; auto data_begin = cudf::detail::make_counting_transform_iterator(0, [](auto idx) { - char const s[] = {static_cast('a' + (idx % 26)), 0}; - return std::string(s); + char const s = static_cast('a' + (idx % 26)); + return std::string{1, s}; }); auto validity = cudf::detail::make_counting_transform_iterator( 0, [](auto idx) -> bool { return (idx % 23) != 22; }); diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 19996f827cf..bc0321bd40a 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -1092,11 +1092,10 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) auto aggregates = std::vector>>(); - aggregates.push_back(std::move(cudf::make_max_aggregation())); - aggregates.push_back(std::move(cudf::make_min_aggregation())); - 
aggregates.push_back(std::move(cudf::make_sum_aggregation())); - aggregates.push_back( - std::move(cudf::make_product_aggregation())); + aggregates.push_back(cudf::make_max_aggregation()); + aggregates.push_back(cudf::make_min_aggregation()); + aggregates.push_back(cudf::make_sum_aggregation()); + aggregates.push_back(cudf::make_product_aggregation()); auto output_type = cudf::data_type{cudf::type_to_id()}; for (auto&& agg : aggregates) { diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 1858cd7782e..b12bf08520f 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -356,7 +356,7 @@ void test_replace(cudf::host_span input_column, for (size_t i = 0; i < values_to_replace_column.size(); i++) { size_t k = 0; - auto pred = [=, &k, &reference_result, &expected_valid, &isReplaced](T element) { + auto pred = [=, &k, &expected_valid, &isReplaced](T element) { bool toBeReplaced = false; if (!isReplaced[k]) { if (!input_has_nulls || expected_valid[k]) { @@ -503,7 +503,7 @@ TYPED_TEST(ReplaceTest, LargeScaleReplaceTest) const size_t REPLACE_SIZE = 10000; thrust::host_vector input_column(DATA_SIZE); - std::generate(std::begin(input_column), std::end(input_column), [REPLACE_SIZE]() { + std::generate(std::begin(input_column), std::end(input_column), []() { return std::rand() % (REPLACE_SIZE); }); diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index f702dc78371..165e0347785 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -214,7 +214,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_list_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i 
!= 0 && i < 4; })}.release(); @@ -338,7 +338,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 4, 8, 12, 12, 12}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -373,7 +373,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 3, 5, 8, 8, 8}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -1499,7 +1499,7 @@ TYPED_TEST(TypedCollectSetTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_set_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2}, {1, 2, 4}, {2, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp index 9cc8b6dec81..2444992e68f 100644 --- a/cpp/tests/rolling/nth_element_test.cpp +++ b/cpp/tests/rolling/nth_element_test.cpp @@ -83,7 +83,7 @@ class rolling_exec { return *this; } - std::unique_ptr test_grouped_nth_element( + [[nodiscard]] std::unique_ptr test_grouped_nth_element( cudf::size_type n, std::optional null_handling = std::nullopt) 
const { return cudf::grouped_rolling_window( @@ -96,7 +96,7 @@ class rolling_exec { n, null_handling.value_or(_null_handling))); } - std::unique_ptr test_nth_element( + [[nodiscard]] std::unique_ptr test_nth_element( cudf::size_type n, std::optional null_handling = std::nullopt) const { return cudf::rolling_window(_input, diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index ec726878b34..0eaab0c9f7a 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,11 @@ using cudf::test::iterators::nulls_at; auto constexpr null = int32_t{0}; // NULL representation for int32_t; +// clang-tidy doesn't think std::transform can handle a +// thrust::constant_iterator, so this is a workaround that uses nulls_at +// instead of no_nulls +auto no_nulls_list() { return nulls_at({}); } + struct OffsetRowWindowTest : public cudf::test::BaseFixture { static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -210,7 +215,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), - lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, no_nulls}); + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, + no_nulls_list()}); } TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) @@ -250,7 +256,7 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, - 
no_nulls}); + no_nulls_list()}); } // To test that preceding bounds are clamped correctly at group boundaries. diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index c2c22986975..6e0dc16dca9 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -541,7 +541,7 @@ class RollingTest : public cudf::test::BaseFixture { agg_op op; for (cudf::size_type i = 0; i < num_rows; i++) { - OutputType val = agg_op::template identity(); + auto val = agg_op::template identity(); // load sizes min_periods = std::max(min_periods, 1); // at least one observation is required diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index 2d37de920d5..2b79911a95a 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -190,7 +190,7 @@ TEST_F(ListScalarTest, MoveConstructorNonNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(data_ptr, s2.view().data()); - EXPECT_EQ(s.view().data(), nullptr); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT } TEST_F(ListScalarTest, MoveConstructorNested) @@ -205,8 +205,8 @@ TEST_F(ListScalarTest, MoveConstructorNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(offset_ptr, s2.view().child(0).data()); EXPECT_EQ(data_ptr, s2.view().child(1).data()); - EXPECT_EQ(s.view().data(), nullptr); - EXPECT_EQ(s.view().num_children(), 0); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT + EXPECT_EQ(s.view().num_children(), 0); // NOLINT } struct StructScalarTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 48711c21715..7584003e800 100644 --- a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,6 @@ using strings_col = cudf::test::strings_column_wrapper; constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; constexpr int32_t null{0}; // Mark for null child elements at the current level -constexpr int32_t XXX{0}; // Mark for null elements at all levels using TestTypes = cudf::test::Concat> grand_child; - grand_child.push_back(std::move(col4.release())); + grand_child.push_back(col4.release()); auto child_col_2 = cudf::make_structs_column(6, std::move(grand_child), 0, rmm::device_buffer{}); child_columns2.push_back(std::move(child_col_2)); auto struct_col3 = diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp index 4d7d23dc881..d5b6915b520 100644 --- a/cpp/tests/stream_compaction/unique_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -43,7 +43,6 @@ auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; -auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; using int32s_col = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 443f4548b2c..07b2d77cc04 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -29,8 +29,6 @@ #include -auto constexpr null{0}; // null at current level -auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr 
KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp index 52839c6fc9f..e5a1ee0988c 100644 --- a/cpp/tests/streams/strings/find_test.cpp +++ b/cpp/tests/streams/strings/find_test.cpp @@ -46,4 +46,5 @@ TEST_F(StringsFindTest, Find) auto const pattern = std::string("[a-z]"); auto const prog = cudf::strings::regex_program::create(pattern); cudf::strings::findall(view, *prog, cudf::test::get_default_stream()); + cudf::strings::find_re(view, *prog, cudf::test::get_default_stream()); } diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp index 9187672221c..cf81dc6fb42 100644 --- a/cpp/tests/streams/transform_test.cpp +++ b/cpp/tests/streams/transform_test.cpp @@ -32,7 +32,7 @@ class TransformTest : public cudf::test::BaseFixture {}; template -void test_udf(char const udf[], Data data_init, cudf::size_type size, bool is_ptx) +void test_udf(char const* udf, Data data_init, cudf::size_type size, bool is_ptx) { auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 7e530b2a34d..5923f8dee5a 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -24,6 +24,7 @@ #include +#include #include struct StringsCharsTest : public cudf::test::BaseFixture {}; @@ -50,20 +51,20 @@ TEST_P(CharsTypes, AllTypes) "de", "\t\r\n\f "}; - bool expecteds[] = {false, false, false, false, false, false, false, false, - false, false, false, false, false, true, false, false, // decimal - false, false, false, false, false, false, false, false, - false, true, false, true, false, true, false, false, // numeric - false, false, false, false, false, false, false, false, - false, false, false, true, false, true, 
false, false, // digit - true, true, false, true, false, false, false, false, - false, false, false, false, false, false, true, false, // alpha - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, true, // space - false, false, false, true, false, false, false, false, - false, false, false, false, false, false, false, false, // upper - false, true, false, false, false, false, false, false, - false, false, false, false, false, false, true, false}; // lower + std::array expecteds{false, false, false, false, false, false, false, false, + false, false, false, false, false, true, false, false, // decimal + false, false, false, false, false, false, false, false, + false, true, false, true, false, true, false, false, // numeric + false, false, false, false, false, false, false, false, + false, false, false, true, false, true, false, false, // digit + true, true, false, true, false, false, false, false, + false, false, false, false, false, false, true, false, // alpha + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, true, // space + false, false, false, true, false, false, false, false, + false, false, false, false, false, false, false, false, // upper + false, true, false, false, false, false, false, false, + false, false, false, false, false, false, true, false}; // lower auto is_parm = GetParam(); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index acf850c7a66..bdfd38267e6 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -32,6 +32,7 @@ #include #include +#include #include struct StringsContainsTests : public cudf::test::BaseFixture {}; @@ -167,10 +168,8 @@ TEST_F(StringsContainsTests, MatchesTest) auto strings_view = cudf::strings_column_view(strings); { auto const pattern = std::string("lazy"); - bool h_expected[] = {false, false, true, false, false, false, false}; 
cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, true, false, false, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -178,10 +177,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string("\\d+"); - bool h_expected[] = {false, false, false, true, true, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, false, true, true, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -189,10 +186,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string("@\\w+"); - bool h_expected[] = {false, false, false, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, false, false, false, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -200,10 +195,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string(".*"); - bool h_expected[] = {true, true, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {true, true, true, true, true, false, true}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -335,9 
+328,9 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) { std::vector data(10); std::generate(data.begin(), data.end(), [n = 0]() mutable { - char first = static_cast('A' + n++); - char raw_data[] = {first, '\0', 'B'}; - return std::string{raw_data, 3}; + char first = static_cast('A' + n++); + std::array raw_data = {first, '\0', 'B'}; + return std::string{raw_data.data(), 3}; }); cudf::test::strings_column_wrapper input(data.begin(), data.end()); auto strings_view = cudf::strings_column_view(input); @@ -749,11 +742,11 @@ TEST_F(StringsContainsTests, ASCII) auto input = cudf::test::strings_column_wrapper({"abc \t\f\r 12", "áé  ❽❽", "aZ ❽4", "XYZ 8"}); auto view = cudf::strings_column_view(input); - std::string patterns[] = {R"(\w+[\s]+\d+)", - R"([^\W]+\s+[^\D]+)", - R"([\w]+[^\S]+[\d]+)", - R"([\w]+\s+[\d]+)", - R"(\w+\s+\d+)"}; + std::array patterns = {R"(\w+[\s]+\d+)", + R"([^\W]+\s+[^\D]+)", + R"([\w]+[^\S]+[\d]+)", + R"([\w]+\s+[\d]+)", + R"(\w+\s+\d+)"}; for (auto ptn : patterns) { auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0}); @@ -787,24 +780,18 @@ TEST_F(StringsContainsTests, MediumRegex) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::contains_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::matches_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, *prog); - int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::count_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } @@ -828,24 +815,18 @@ TEST_F(StringsContainsTests, LargeRegex) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::contains_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::matches_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, *prog); - int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::count_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } diff --git a/cpp/tests/strings/durations_tests.cpp b/cpp/tests/strings/durations_tests.cpp index 86189b29981..f2e31339035 100644 --- a/cpp/tests/strings/durations_tests.cpp +++ b/cpp/tests/strings/durations_tests.cpp @@ -24,6 +24,7 @@ #include 
+#include #include struct StringsDurationsTest : public cudf::test::BaseFixture {}; @@ -403,17 +404,17 @@ TEST_F(StringsDurationsTest, ParseSingle) "01", ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0}; - auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 3600}; }); + std::array expected_v{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0}; + auto it1 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_s{i * 3600}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%H"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); - auto it2 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 60}; }); + auto it2 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_s{i * 60}; }); cudf::test::fixed_width_column_wrapper expected_s2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -421,14 +422,14 @@ TEST_F(StringsDurationsTest, ParseSingle) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s2); auto it3 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s3(it3, it3 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%S"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s3); - auto it4 = thrust::make_transform_iterator(expected_v, + auto it4 = thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ms{i * 60000}; }); 
cudf::test::fixed_width_column_wrapper expected_ms(it4, it4 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), @@ -454,21 +455,21 @@ TEST_F(StringsDurationsTest, ParseMultiple) "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0, - -1, - -(3600 + 60 + 1), - 23 * 3600 + 1, - -(23 * 3600 + 1), - 59 * 3600, - -59 * 3600, - 99 * 3600, - -99 * 3600, - 0, - 3661, - 0}; + std::array expected_v{0, + 0, + -1, + -(3600 + 60 + 1), + 23 * 3600 + 1, + -(23 * 3600 + 1), + 59 * 3600, + -59 * 3600, + 99 * 3600, + -99 * 3600, + 0, + 3661, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -476,7 +477,7 @@ TEST_F(StringsDurationsTest, ParseMultiple) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); auto it2 = thrust::make_transform_iterator( - expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); + expected_v.data(), [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); cudf::test::fixed_width_column_wrapper expected_D2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -508,28 +509,28 @@ TEST_F(StringsDurationsTest, ParseSubsecond) "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); - int64_t expected_v[]{0, - -123456789L, - -1000666999L, - -((3600 + 60 + 1) * 1000000000L + 100000000L), - (23 * 3600 + 1) * 1000000000L + 80L, - -((23 * 3600 + 1) * 1000000000L + 123000000L), - (59 * 3600) * 1000000000L, - -(59 * 3600) * 1000000000L, - (99 * 3600) * 1000000000L, - -(99 * 3600) * 1000000000L, - 0, - (3661) * 1000000000L, - 0}; 
+ std::array expected_v{0, + -123456789L, + -1000666999L, + -((3600 + 60 + 1) * 1000000000L + 100000000L), + (23 * 3600 + 1) * 1000000000L + 80L, + -((23 * 3600 + 1) * 1000000000L + 123000000L), + (59 * 3600) * 1000000000L, + -(59 * 3600) * 1000000000L, + (99 * 3600) * 1000000000L, + -(99 * 3600) * 1000000000L, + 0, + (3661) * 1000000000L, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ns{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ns{i}; }); cudf::test::fixed_width_column_wrapper expected_ns1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%H:%M:%S"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ns1); - auto it2 = thrust::make_transform_iterator(expected_v, + auto it2 = thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ms{i / 1000000}; }); cudf::test::fixed_width_column_wrapper expected_ms2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), @@ -559,25 +560,25 @@ TEST_F(StringsDurationsTest, ParseAMPM) "01:01:01", // error ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0 + 12 * 3600, - 0, - 0 - 12 * 3600, - -1, - -1 - 12 * 3600, - -(3600 + 60 + 1), - -(3600 + 60 + 1) - 12 * 3600, - 11 * 3600 + 59 * 60 + 59, - 11 * 3600 + 59 * 60 + 59 + 12 * 3600, - -(11 * 3600 + 59 * 60 + 59), - -(11 * 3600 + 59 * 60 + 59 + 12 * 3600), - 0, - 0, - 0, - 0, - 0}; + std::array expected_v{0, + 0 + 12 * 3600, + 0, + 0 - 12 * 3600, + -1, + -1 - 12 * 3600, + -(3600 + 60 + 1), + -(3600 + 60 + 1) - 12 * 3600, + 11 * 3600 + 59 * 60 + 59, + 11 * 3600 + 59 * 60 + 59 + 12 * 3600, + -(11 * 3600 + 59 * 60 + 59), + -(11 * 3600 + 59 * 60 + 59 + 12 * 3600), + 0, + 0, + 0, + 0, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); 
+ thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -585,7 +586,7 @@ TEST_F(StringsDurationsTest, ParseAMPM) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); auto it2 = thrust::make_transform_iterator( - expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); + expected_v.data(), [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); cudf::test::fixed_width_column_wrapper expected_D2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -616,20 +617,20 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "01:01:01", // error ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0 + 12 * 3600, - 1, - 1 + 12 * 3600, - (3600 + 60 + 1), - (3600 + 60 + 1) + 12 * 3600, - 11 * 3600 + 59 * 60 + 59, - 11 * 3600 + 59 * 60 + 59 + 12 * 3600, - 0, - 0, - 0, - 0}; + std::array expected_v{0, + 0 + 12 * 3600, + 1, + 1 + 12 * 3600, + (3600 + 60 + 1), + (3600 + 60 + 1) + 12 * 3600, + 11 * 3600 + 59 * 60 + 59, + 11 * 3600 + 59 * 60 + 59 + 12 * 3600, + 0, + 0, + 0, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -641,8 +642,8 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "%OI:%OM:%OS %p"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); - auto it2 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ms{i * 1000}; }); + 
auto it2 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_ms{i * 1000}; }); cudf::test::fixed_width_column_wrapper expected_s2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 1491da758d5..61246fb098d 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -275,8 +275,8 @@ TEST_F(StringsExtractTests, ExtractAllTest) auto pattern = std::string("(\\d+) (\\w+)"); - bool valids[] = {true, true, true, false, false, false, true}; - using LCW = cudf::test::lists_column_wrapper; + std::array valids{true, true, true, false, false, false, true}; + using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"123", "banana", "7", "eleven"}, LCW{"41", "apple"}, LCW{"6", "péar", "0", "pair"}, @@ -284,7 +284,7 @@ TEST_F(StringsExtractTests, ExtractAllTest) LCW{}, LCW{}, LCW{"4", "paré"}}, - valids); + valids.data()); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::extract_all_record(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 6eea1895fb1..4821a7fa999 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -33,10 +34,10 @@ struct StringsFindallTests : public cudf::test::BaseFixture {}; TEST_F(StringsFindallTests, FindallTest) { - bool valids[] = {true, true, true, true, true, false, true, true}; + std::array valids{true, true, true, true, true, false, true, true}; cudf::test::strings_column_wrapper input( {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, - valids); + valids.data()); auto sv = 
cudf::strings_column_view(input); auto pattern = std::string("(\\d+)-(\\w+)"); @@ -50,7 +51,7 @@ TEST_F(StringsFindallTests, FindallTest) LCW{}, LCW{}, LCW{"25-9000"}}, - valids); + valids.data()); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::findall(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); @@ -149,6 +150,22 @@ TEST_F(StringsFindallTests, LargeRegex) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, FindTest) +{ + auto const valids = cudf::test::iterators::null_at(5); + cudf::test::strings_column_wrapper input( + {"3A", "May4", "Jan2021", "March", "A9BC", "", "", "abcdef ghijklm 12345"}, valids); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("\\d+"); + + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::find_re(sv, *prog); + auto expected = + cudf::test::fixed_width_column_wrapper({0, 3, 3, -1, 1, 0, -1, 15}, valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + TEST_F(StringsFindallTests, NoMatches) { cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); @@ -169,10 +186,16 @@ TEST_F(StringsFindallTests, EmptyTest) auto prog = cudf::strings::regex_program::create(pattern); cudf::test::strings_column_wrapper input; - auto sv = cudf::strings_column_view(input); - auto results = cudf::strings::findall(sv, *prog); - - using LCW = cudf::test::lists_column_wrapper; - LCW expected; - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + auto sv = cudf::strings_column_view(input); + { + auto results = cudf::strings::findall(sv, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + } + { + auto results = cudf::strings::find_re(sv, *prog); + auto expected = cudf::test::fixed_width_column_wrapper{}; + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + } } diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index ce5f68de3c9..26bcfe8028d 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -425,7 +426,7 @@ TYPED_TEST(StringsIntegerConvertTest, IntegerToHex) if (v == 0) { return std::string("00"); } // special handling for single-byte types if constexpr (std::is_same_v || std::is_same_v) { - char const hex_digits[16] = { + std::array const hex_digits = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; std::string str; str += hex_digits[(v & 0xF0) >> 4]; diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index f0010fc1ed9..219bd6d8b01 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -635,9 +635,8 @@ TEST_F(StructColumnWrapperTest, TestStructsColumnWithEmptyChild) auto mask_vec = std::vector{true, false, false}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(mask_vec.begin(), mask_vec.end()); - auto structs_col = - cudf::make_structs_column(num_rows, std::move(cols), null_count, std::move(null_mask)); - EXPECT_NO_THROW(structs_col->view()); + EXPECT_NO_THROW(auto structs_col = cudf::make_structs_column( + num_rows, std::move(cols), null_count, std::move(null_mask))); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index 215ca158f37..2684123c08a 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -32,7 +32,7 @@ struct MaskToNullTest : public cudf::test::BaseFixture { { cudf::test::fixed_width_column_wrapper input_column( input.begin(), input.end(), val.begin()); - std::transform(val.begin(), val.end(), input.begin(), 
input.begin(), std::logical_and()); + std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and<>()); auto sample = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp b/cpp/tests/transform/integration/unary_transform_test.cpp index 5fa02d9978a..0bdf5b321ac 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -30,7 +30,7 @@ namespace transformation { struct UnaryOperationIntegrationTest : public cudf::test::BaseFixture {}; template -void test_udf(char const udf[], Op op, Data data_init, cudf::size_type size, bool is_ptx) +void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool is_ptx) { auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); @@ -47,7 +47,7 @@ void test_udf(char const udf[], Op op, Data data_init, cudf::size_type size, boo TEST_F(UnaryOperationIntegrationTest, Transform_FP32_FP32) { // c = a*a*a*a - char const* cuda = + std::string const cuda = R"***( __device__ inline void fdsf ( float* C, @@ -58,7 +58,7 @@ __device__ inline void fdsf ( } )***"; - char const* ptx = + std::string const ptx = R"***( // // Generated by NVIDIA NVVM Compiler @@ -101,17 +101,17 @@ __device__ inline void fdsf ( auto op = [](dtype a) { return a * a * a * a; }; auto data_init = [](cudf::size_type row) { return row % 3; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) { // c = a * a - a - char const cuda[] = + std::string const cuda = "__device__ inline void f(int* output,int input){*output = input*input - input;}"; - char 
const* ptx = + std::string const ptx = R"***( .func _Z1fPii( .param .b64 _Z1fPii_param_0, @@ -136,8 +136,8 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) auto op = [](dtype a) { return a * a - a; }; auto data_init = [](cudf::size_type row) { return row % 78; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) @@ -145,7 +145,7 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) // Capitalize all the lower case letters // Assuming ASCII, the PTX code is compiled from the following CUDA code - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f( signed char* output, @@ -159,7 +159,7 @@ __device__ inline void f( } )***"; - char const ptx[] = + std::string const ptx = R"***( .func _Z1fPcc( .param .b64 _Z1fPcc_param_0, @@ -191,15 +191,15 @@ __device__ inline void f( auto op = [](dtype a) { return std::toupper(a); }; auto data_init = [](cudf::size_type row) { return 'a' + (row % 26); }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_Datetime) { // Add one day to timestamp in microseconds - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) { @@ -217,7 +217,7 @@ __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) auto random_eng = cudf::test::UniformRandomGenerator(0, 100000000); auto data_init = [&random_eng](cudf::size_type row) { return random_eng.generate(); }; - test_udf(cuda, op, data_init, 500, false); + test_udf(cuda.c_str(), op, data_init, 500, false); } } // namespace transformation diff --git 
a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu new file mode 100644 index 00000000000..98657f8e224 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +template +struct BatchedMemcpyTest : public cudf::test::BaseFixture {}; + +TEST(BatchedMemcpyTest, BasicTest) +{ + using T1 = int64_t; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + // Buffer lengths (in number of elements) + std::vector const h_lens{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Total number of buffers + auto const num_buffs = h_lens.size(); + + // Exclusive sum of buffer lengths for pointers + std::vector h_lens_excl_sum(num_buffs); + std::exclusive_scan(h_lens.begin(), h_lens.end(), h_lens_excl_sum.begin(), 0); + + // Corresponding buffer sizes (in bytes) + std::vector h_sizes_bytes; + h_sizes_bytes.reserve(num_buffs); + std::transform( + h_lens.cbegin(), h_lens.cend(), std::back_inserter(h_sizes_bytes), [&](auto& size) { + return size * sizeof(T1); + }); + + // Initialize random engine + auto 
constexpr seed = 0xcead; + std::mt19937 engine{seed}; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + uniform_distribution dist{}; + + // Generate a src vector of random data vectors + std::vector> h_sources; + h_sources.reserve(num_buffs); + std::transform(h_lens.begin(), h_lens.end(), std::back_inserter(h_sources), [&](auto size) { + std::vector data(size); + std::generate_n(data.begin(), size, [&]() { return T1{dist(engine)}; }); + return data; + }); + // Copy the vectors to device + std::vector> h_device_vecs; + h_device_vecs.reserve(h_sources.size()); + std::transform( + h_sources.begin(), h_sources.end(), std::back_inserter(h_device_vecs), [stream, mr](auto& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + // Pointers to the source vectors + std::vector h_src_ptrs; + h_src_ptrs.reserve(h_sources.size()); + std::transform( + h_device_vecs.begin(), h_device_vecs.end(), std::back_inserter(h_src_ptrs), [](auto& vec) { + return static_cast(vec.data()); + }); + // Copy the source data pointers to device + auto d_src_ptrs = cudf::detail::make_device_uvector_async(h_src_ptrs, stream, mr); + + // Total number of elements in all buffers + auto const total_buff_len = std::accumulate(h_lens.cbegin(), h_lens.cend(), 0); + + // Create one giant buffer for destination + auto d_dst_data = cudf::detail::make_zeroed_device_uvector_async(total_buff_len, stream, mr); + // Pointers to destination buffers within the giant destination buffer + std::vector h_dst_ptrs(num_buffs); + std::for_each(thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(num_buffs), + [&](auto i) { return h_dst_ptrs[i] = d_dst_data.data() + h_lens_excl_sum[i]; }); + // Copy destination data pointers to device + auto d_dst_ptrs = cudf::detail::make_device_uvector_async(h_dst_ptrs, stream, mr); + + // Copy buffer 
size iterators (in bytes) to device + auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); + + // Run the batched memcpy + cudf::detail::batched_memcpy_async( + d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); + + // Expected giant destination buffer after the memcpy + std::vector expected_buffer; + expected_buffer.reserve(total_buff_len); + std::for_each(h_sources.cbegin(), h_sources.cend(), [&expected_buffer](auto& source) { + expected_buffer.insert(expected_buffer.end(), source.begin(), source.end()); + }); + + // Copy over the result destination buffer to host and synchronize the stream + auto result_dst_buffer = + cudf::detail::make_std_vector_sync(cudf::device_span(d_dst_data), stream); + + // Check if both vectors are equal + EXPECT_TRUE( + std::equal(expected_buffer.begin(), expected_buffer.end(), result_dst_buffer.begin())); +} diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu index bed0f40d70e..0eeb7b95318 100644 --- a/cpp/tests/utilities_tests/batched_memset_tests.cu +++ b/cpp/tests/utilities_tests/batched_memset_tests.cu @@ -18,8 +18,8 @@ #include #include +#include #include -#include #include #include #include @@ -78,7 +78,7 @@ TEST(MultiBufferTestIntegral, BasicTest1) }); // Function Call - cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream); + cudf::detail::batched_memset(memset_bufs, uint64_t{0}, stream); // Set all buffer regions to 0 for expected comparison std::for_each( diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index d052e20eedb..cfab570833b 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -28,16 +28,17 @@ class LoggerTest : public cudf::test::BaseFixture { std::vector prev_sinks; public: - LoggerTest() : prev_level{cudf::logger().level()}, prev_sinks{cudf::logger().sinks()} + 
LoggerTest() + : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} { - cudf::logger().sinks() = {std::make_shared(oss)}; - cudf::logger().set_formatter( + cudf::detail::logger().sinks() = {std::make_shared(oss)}; + cudf::detail::logger().set_formatter( std::unique_ptr(new spdlog::pattern_formatter("%v"))); } ~LoggerTest() override { - cudf::logger().set_level(prev_level); - cudf::logger().sinks() = prev_sinks; + cudf::detail::logger().set_level(prev_level); + cudf::detail::logger().sinks() = prev_sinks; } void clear_sink() { oss.str(""); } @@ -46,32 +47,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::logger().critical("crit msg"); + cudf::detail::logger().critical("crit msg"); ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); - cudf::logger().info("info"); - cudf::logger().warn("warn"); - cudf::logger().error("error"); - cudf::logger().critical("critical"); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); + cudf::detail::logger().error("error"); + cudf::detail::logger().critical("critical"); ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::logger().set_level(spdlog::level::warn); - cudf::logger().info("info"); - cudf::logger().warn("warn"); + cudf::detail::logger().set_level(spdlog::level::warn); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::logger().set_level(spdlog::level::debug); - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); + cudf::detail::logger().set_level(spdlog::level::debug); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); ASSERT_EQ(this->sink_content(), 
"debug\n"); } diff --git a/dependencies.yaml b/dependencies.yaml index ed36a23e5c3..3561b22965d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -15,6 +15,7 @@ files: - depends_on_cupy - depends_on_libkvikio - depends_on_librmm + - depends_on_nvcomp - depends_on_rmm - develop - docs @@ -152,6 +153,13 @@ files: - build_cpp - depends_on_libkvikio - depends_on_librmm + py_run_libcudf: + output: pyproject + pyproject_dir: python/libcudf + extras: + table: project + includes: + - depends_on_nvcomp py_build_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf @@ -367,9 +375,27 @@ dependencies: - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 + - spdlog>=1.14.1,<1.15 + depends_on_nvcomp: + common: + - output_types: conda + packages: # Align nvcomp version with rapids-cmake - nvcomp==4.0.1 - - spdlog>=1.14.1,<1.15 + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + packages: + - nvidia-nvcomp-cu12==4.0.1 + - matrix: + cuda: "11.*" + packages: + - nvidia-nvcomp-cu11==4.0.1 + - matrix: + packages: + - nvidia-nvcomp==4.0.1 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] @@ -576,7 +602,7 @@ dependencies: packages: - fsspec>=0.6.0 - &numpy numpy>=1.23,<3.0a0 - - pandas>=2.0,<2.2.3dev0 + - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: - output_types: [conda, requirements, pyproject] @@ -722,6 +748,10 @@ dependencies: packages: - *numba-cuda-dep - pandas==2.0.* + - matrix: {dependencies: "latest"} + packages: + - numba-cuda==0.0.15 + - pandas==2.2.3 - matrix: packages: - output_types: conda diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md index a8a6d81d6fb..911a64fa152 100644 --- a/docs/cudf/source/developer_guide/cudf_pandas.md +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -136,3 +136,21 @@ Arrays are not almost equal to 7 decimals ACTUAL: 1.0 DESIRED: 2.0. 
``` + +Setting the environment variable `CUDF_PANDAS_FAIL_ON_FALLBACK` causes `cudf.pandas` to fail when falling back from cuDF to Pandas. +For example, +```python +import cudf.pandas +cudf.pandas.install() +import pandas as pd +import numpy as np + +df = pd.DataFrame({ + 'complex_col': [1 + 2j, 3 + 4j, 5 + 6j] +}) + +print(df) +``` +``` +ProxyFallbackError: The operation failed with cuDF, the reason was : Series with Complex128DType is not supported. +``` diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md index f12f809d5db..22cc1b5b8de 100644 --- a/docs/cudf/source/developer_guide/testing.md +++ b/docs/cudf/source/developer_guide/testing.md @@ -7,6 +7,23 @@ specifically the [`pytest-cov`](https://github.com/pytest-dev/pytest-cov) plugin Code coverage reports are uploaded to [Codecov](https://app.codecov.io/gh/rapidsai/cudf). Each PR also indicates whether it increases or decreases test coverage. +### Configuring pytest + +Pytest will accept configuration in [multiple different +files](https://docs.pytest.org/en/stable/reference/customize.html), +with a specified discovery and precedence order. Note in particular +that there is no automatic "include" mechanism, as soon as a matching +configuration file is found, discovery stops. + +For preference, so that all tool configuration lives in the same +place, we use `pyproject.toml`-based configuration. Test configuration +for a given package should live in that package's `pyproject.toml` +file. + +Where tests do not naturally belong to a project, for example the +`cudf.pandas` integration tests and the cuDF benchmarks, use a +`pytest.ini` file as close to the tests as possible. 
+ ## Test organization How tests are organized depends on which of the following two groups they fall into: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e21536e2e97..052479d6720 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -49,3 +49,4 @@ This page provides API documentation for pylibcudf. io/index.rst strings/index.rst + nvtext/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst new file mode 100644 index 00000000000..abb45e426a8 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst @@ -0,0 +1,6 @@ +============= +edit_distance +============= + +.. automodule:: pylibcudf.nvtext.edit_distance + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst new file mode 100644 index 00000000000..d68199271bd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst @@ -0,0 +1,6 @@ +=============== +generate_ngrams +=============== + +.. automodule:: pylibcudf.nvtext.generate_ngrams + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst new file mode 100644 index 00000000000..2e03b589c8b --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -0,0 +1,8 @@ +nvtext +====== + +.. 
toctree:: + :maxdepth: 1 + + edit_distance + generate_ngrams diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst new file mode 100644 index 00000000000..8e86b33b1a0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst @@ -0,0 +1,6 @@ +============= +find_multiple +============= + +.. automodule:: pylibcudf.strings.find_multiple + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 9b1a6b72a88..48dc8a13c3e 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -9,10 +9,15 @@ strings contains extract find + find_multiple findall + padding regex_flags regex_program repeat replace + side_type slice + split strip + wrap diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst new file mode 100644 index 00000000000..5b417024fd5 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst @@ -0,0 +1,6 @@ +======= +padding +======= + +.. automodule:: pylibcudf.strings.padding + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst new file mode 100644 index 00000000000..d5aef9c4f75 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst @@ -0,0 +1,6 @@ +========= +side_type +========= + +.. 
automodule:: pylibcudf.strings.side_type + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst new file mode 100644 index 00000000000..cba96e86f45 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst @@ -0,0 +1,6 @@ +===== +split +===== + +.. automodule:: pylibcudf.strings.split + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst new file mode 100644 index 00000000000..bd825f78568 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst @@ -0,0 +1,6 @@ +==== +wrap +==== + +.. automodule:: pylibcudf.strings.wrap + :members: diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index 8cc7df1ce7f..6bd6603d71b 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -612,13 +612,13 @@ void testWithSetOutputType() { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.666667f); + try (Scalar expected = Scalar.fromFloat(1.6666666f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.variance(DType.FLOAT32)) { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.2909945f); + try (Scalar expected = Scalar.fromFloat(1.2909944f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.standardDeviation(DType.FLOAT32)) { assertEquals(expected, result); diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index bc5e085ec39..d844466120f 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -13,12 +13,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.filling 
cimport calendrical_month_sequence from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.datetime import DatetimeComponent from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar -import pylibcudf as plc - @acquire_spill_lock() def add_months(Column col, Column months): @@ -40,9 +39,39 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - result = Column.from_pylibcudf( - plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) - ) + + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + cdef libcudf_datetime.datetime_component component + + component_names = { + "year": DatetimeComponent.YEAR, + "month": DatetimeComponent.MONTH, + "day": DatetimeComponent.DAY, + "weekday": DatetimeComponent.WEEKDAY, + "hour": DatetimeComponent.HOUR, + "minute": DatetimeComponent.MINUTE, + "second": DatetimeComponent.SECOND, + "millisecond": DatetimeComponent.MILLISECOND, + "microsecond": DatetimeComponent.MICROSECOND, + "nanosecond": DatetimeComponent.NANOSECOND, + } + if field == "day_of_year": + with nogil: + c_result = move(libcudf_datetime.day_of_year(col_view)) + elif field in component_names: + component = component_names[field] + with nogil: + c_result = move( + libcudf_datetime.extract_datetime_component( + col_view, + component + ) + ) + else: + raise ValueError(f"Invalid field: '{field}'") + + result = Column.from_unique_ptr(move(c_result)) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx index e3c2273345a..3dd99c42d76 100644 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx @@ -2,37 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility 
cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.edit_distance cimport ( - edit_distance as cpp_edit_distance, - edit_distance_matrix as cpp_edit_distance_matrix, -) +from pylibcudf cimport nvtext from cudf._lib.column cimport Column @acquire_spill_lock() def edit_distance(Column strings, Column targets): - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def edit_distance_matrix(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance_matrix( + strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index 6591b527eec..7fdf9258b7f 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -2,75 +2,34 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, - hash_character_ngrams as cpp_hash_character_ngrams, -) -from 
pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def generate_ngrams(Column strings, int ngrams, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams, - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams, + py_separator.device_value.c_value + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def generate_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def hash_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.hash_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 
60a6795a402..d9595f4ab0a 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -3,25 +3,13 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from libcpp.memory cimport unique_ptr -from libcpp.string cimport string from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, to_floats as cpp_to_floats, @@ -33,11 +21,6 @@ from pylibcudf.libcudf.strings.convert.convert_integers cimport ( is_hex as cpp_is_hex, to_integers as cpp_to_integers, ) -from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( - integers_to_ipv4 as cpp_integers_to_ipv4, - ipv4_to_integers as cpp_ipv4_to_integers, - is_ipv4 as cpp_is_ipv4, -) from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id @@ -427,77 +410,21 @@ def stoul(Column input_col): return string_to_integer(input_col, cudf.dtype("uint64")) -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const 
string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) - - def to_booleans(Column input_col): - - return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - ---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_booleans.to_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + ) + return Column.from_pylibcudf(plc_column) def from_booleans(Column input_col): - return _from_booleans(input_col) + plc_column = plc.strings.convert.convert_booleans.from_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + as_device_scalar("False").c_value, + ) + return Column.from_pylibcudf(plc_column) def int2timestamp( @@ -520,11 +447,10 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef string c_timestamp_format = format.encode("UTF-8") return Column.from_pylibcudf( 
plc.strings.convert.convert_datetime.from_timestamps( input_col.to_pylibcudf(mode="read"), - c_timestamp_format, + format, names.to_pylibcudf(mode="read") ) ) @@ -545,12 +471,11 @@ def timestamp2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_datetime.to_timestamps( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -572,16 +497,11 @@ def istimestamp(Column input_col, str format): """ if input_col.size == 0: return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_datetime.is_timestamp( + input_col.to_pylibcudf(mode="read"), + format + ) + return Column.from_pylibcudf(plc_column) def timedelta2int(Column input_col, dtype, format): @@ -599,12 +519,11 @@ def timedelta2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.to_durations( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -623,12 +542,10 @@ def int2timedelta(Column input_col, str format): A Column with Timedelta represented in string format """ - - cdef string c_duration_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.from_durations( input_col.to_pylibcudf(mode="read"), - c_duration_format + format ) ) @@ -646,14 +563,10 @@ def int2ip(Column input_col): A Column with integer represented in string ipv4 format """ - - cdef column_view input_column_view = 
input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_ipv4(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def ip2int(Column input_col): @@ -669,14 +582,10 @@ def ip2int(Column input_col): A Column with ipv4 represented as integer """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ipv4_to_integers(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def is_ipv4(Column source_strings): @@ -685,15 +594,10 @@ def is_ipv4(Column source_strings): that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn where nnn is integer digits in [0,255]. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_ipv4( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.is_ipv4( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def htoi(Column input_col, **kwargs): diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 4bf8a9b1a8f..e712937f816 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -71,16 +71,9 @@ startswith_multiple, ) from cudf._lib.strings.find_multiple import find_multiple -from cudf._lib.strings.findall import findall +from cudf._lib.strings.findall import find_re, findall from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object -from cudf._lib.strings.padding import ( - SideType, - center, - ljust, - pad, - rjust, - zfill, -) +from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( insert, diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 376a6f8af97..a57ce29eb45 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -1,23 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.char_types cimport ( - all_characters_of_type as cpp_all_characters_of_type, - filter_characters_of_type as cpp_filter_characters_of_type, - string_character_types, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf.strings import char_types @acquire_spill_lock() @@ -25,26 +14,15 @@ def filter_alphanum(Column source_strings, object py_repl, bool keep=True): """ Returns a Column of strings keeping only alphanumeric character types. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() + plc_column = char_types.filter_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALL_TYPES if keep + else char_types.StringCharacterTypes.ALPHANUM, + py_repl.device_value.c_value, + char_types.StringCharacterTypes.ALPHANUM if keep + else char_types.StringCharacterTypes.ALL_TYPES ) - - with nogil: - c_result = move(cpp_filter_characters_of_type( - source_view, - string_character_types.ALL_TYPES if keep - else string_character_types.ALPHANUM, - scalar_repl[0], - string_character_types.ALPHANUM if keep - else string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -54,17 +32,12 @@ def is_decimal(Column source_strings): that contain only decimal characters -- those that can be used to extract base10 numbers. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DECIMAL, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.DECIMAL, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -75,17 +48,12 @@ def is_alnum(Column source_strings): Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal() """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHANUM, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALPHANUM, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -94,17 +62,12 @@ def is_alpha(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only alphabetic characters. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHA, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALPHA, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -113,17 +76,12 @@ def is_digit(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only decimal and digit characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DIGIT, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.DIGIT, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -133,17 +91,12 @@ def is_numeric(Column source_strings): that contain only numeric characters. These include digit and numeric characters. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.NUMERIC, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.NUMERIC, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -152,17 +105,12 @@ def is_upper(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only upper-case characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.UPPER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.UPPER, + char_types.StringCharacterTypes.CASE_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -171,17 +119,12 @@ def is_lower(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only lower-case characters. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.LOWER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.LOWER, + char_types.StringCharacterTypes.CASE_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -190,14 +133,9 @@ def is_space(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contains all characters which are spaces only. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.SPACE, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.SPACE, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index a8df8c9a92c..96dcd021c3b 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -1,22 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
-import cudf - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( - from_fixed_point as cpp_from_fixed_point, - is_fixed_point as cpp_is_fixed_point, - to_fixed_point as cpp_to_fixed_point, -) -from pylibcudf.libcudf.types cimport data_type, type_id - from cudf._lib.column cimport Column +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import pylibcudf as plc @acquire_spill_lock() @@ -32,14 +21,10 @@ def from_decimal(Column input_col): ------- A column of strings representing the input decimal values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_fixed_point( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,25 +42,11 @@ def to_decimal(Column input_col, object out_type): ------- A column of decimals parsed from the string values. 
""" - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef int scale = out_type.scale - cdef data_type c_out_type - if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(type_id.DECIMAL32, -scale) - elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(type_id.DECIMAL64, -scale) - elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(type_id.DECIMAL128, -scale) - else: - raise TypeError("should be a decimal dtype") - with nogil: - c_result = move( - cpp_to_fixed_point( - input_column_view, - c_out_type)) - - result = Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type), + ) + result = Column.from_pylibcudf(plc_column) result.dtype.precision = out_type.precision return result @@ -98,14 +69,8 @@ def is_fixed_point(Column input_col, object dtype): ------- A Column of booleans indicating valid decimal conversion. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = input_col.view() - cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) - with nogil: - c_result = move(cpp_is_fixed_point( - source_view, - c_dtype - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(dtype), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx index 1358f8e3c2c..39e0013769f 100644 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.find_multiple cimport ( - find_multiple as cpp_find_multiple, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def find_multiple(Column source_strings, Column target_strings): @@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings): Returns a column with character position values where each of the `target_strings` are found in each string of `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - - with nogil: - c_result = move(cpp_find_multiple( - source_view, - target_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.find_multiple.find_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 0e758d5b322..3e7a504d535 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -23,3 +23,19 @@ def findall(Column source_strings, object pattern, uint32_t flags): prog, ) return Column.from_pylibcudf(plc_result) + + +@acquire_spill_lock() +def find_re(Column source_strings, object pattern, uint32_t flags): + """ + Returns character positions where the pattern first matches + the elements in source_strings. 
+ """ + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.find_re( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx index d0239e91ec3..015a2ebab8a 100644 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ b/python/cudf/cudf/_lib/strings/padding.pyx @@ -1,64 +1,31 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from enum import IntEnum - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.strings.padding cimport ( - pad as cpp_pad, - zfill as cpp_zfill, -) -from pylibcudf.libcudf.strings.side_type cimport ( - side_type, - underlying_type_t_side_type, -) - - -class SideType(IntEnum): - LEFT = side_type.LEFT - RIGHT = side_type.RIGHT - BOTH = side_type.BOTH +import pylibcudf as plc @acquire_spill_lock() def pad(Column source_strings, size_type width, fill_char, - side=SideType.LEFT): + side=plc.strings.side_type.SideType.LEFT): """ Returns a Column by padding strings in `source_strings` up to the given `width`. Direction of padding is to be specified by `side`. The additional characters being filled can be changed by specifying `fill_char`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - cdef side_type pad_direction = ( - side + plc_result = plc.strings.padding.pad( + source_strings.to_pylibcudf(mode="read"), + width, + side, + fill_char, ) - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - pad_direction, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -68,19 +35,13 @@ def zfill(Column source_strings, Returns a Column by prepending strings in `source_strings` with '0' characters up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_zfill( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.padding.zfill( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) -@acquire_spill_lock() def center(Column source_strings, size_type width, fill_char): @@ -89,23 +50,9 @@ def center(Column source_strings, in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.BOTH, - f_char - )) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH) - return Column.from_unique_ptr(move(c_result)) - -@acquire_spill_lock() def ljust(Column source_strings, size_type width, fill_char): @@ -113,23 +60,9 @@ def ljust(Column source_strings, Returns a Column by filling right side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT) - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.RIGHT, - f_char - )) - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() def rjust(Column source_strings, size_type width, fill_char): @@ -137,17 +70,4 @@ def rjust(Column source_strings, Returns a Column by filling left side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.LEFT, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT) diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index a81fb18e752..5319addc41c 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -1,21 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.split.partition cimport ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -25,25 +14,11 @@ def partition(Column source_strings, Returns data by splitting the `source_strings` column at the first occurrence of the specified `py_delimiter`. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_partition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.partition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -53,22 +28,8 @@ def rpartition(Column source_strings, Returns a Column by splitting the `source_strings` column at the last occurrence of the specified `py_delimiter`. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rpartition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.rpartition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index f481fea4c51..4ec6c7073d8 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,33 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.split.split cimport ( - rsplit as cpp_rsplit, - rsplit_re as cpp_rsplit_re, - rsplit_record as cpp_rsplit_record, - rsplit_record_re as cpp_rsplit_record_re, - split as cpp_split, - split_re as cpp_split_re, - split_record as cpp_split_record, - split_record_re as cpp_split_record_re, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport 
data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -39,26 +18,12 @@ def split(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -70,25 +35,12 @@ def split_record(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -100,26 +52,12 @@ def rsplit(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -160,24 +85,15 @@ def split_re(Column source_strings, Returns data by splitting the `source_strings` column around the delimiters identified by `pattern`. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -217,23 +124,15 @@ def split_record_re(Column source_strings, Returns a Column by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index 38ecb21a94c..982c5a600e7 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -1,18 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.side_type cimport side_type -from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar import pylibcudf as plc @@ -24,15 +14,12 @@ def strip(Column source_strings, The set of characters need be stripped from left and right side can be specified by `py_repl`. 
""" - - cdef DeviceScalar repl = py_repl.device_value - return Column.from_pylibcudf( - plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.SideType.BOTH, - repl.c_value - ) + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.BOTH, + py_repl.device_value.c_value, ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -43,24 +30,12 @@ def lstrip(Column source_strings, The set of characters need be stripped from left side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.LEFT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.LEFT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -71,21 +46,9 @@ def rstrip(Column source_strings, The set of characters need be stripped from right side can be specified by `py_repl`. 
""" - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.RIGHT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.RIGHT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx index 3fad91bbfc0..3ef478532c2 100644 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ b/python/cudf/cudf/_lib/strings/translate.pyx @@ -1,25 +1,12 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move -from libcpp.vector cimport vector from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.translate cimport ( - filter_characters as cpp_filter_characters, - filter_type, - translate as cpp_translate, -) -from pylibcudf.libcudf.types cimport char_utf8 - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +import pylibcudf as plc @acquire_spill_lock() @@ -29,30 +16,11 @@ def translate(Column source_strings, Translates individual characters within each string if present in the mapping_table. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - with nogil: - c_result = move(cpp_translate(source_view, c_mapping_table)) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.translate.translate( + source_strings.to_pylibcudf(mode="read"), + mapping_table, + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -64,44 +32,11 @@ def filter_characters(Column source_strings, Removes or keeps individual characters within each string using the provided mapping_table. 
""" - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() + plc_result = plc.strings.translate.filter_characters( + source_strings.to_pylibcudf(mode="read"), + mapping_table, + plc.strings.translate.FilterType.KEEP + if keep else plc.strings.translate.FilterType.REMOVE, + py_repl.device_value.c_value ) - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - cdef filter_type c_keep - if keep is True: - c_keep = filter_type.KEEP - else: - c_keep = filter_type.REMOVE - - with nogil: - c_result = move(cpp_filter_characters( - source_view, - c_mapping_table, - c_keep, - scalar_repl[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx index eed5cf33b10..2b40f01f818 100644 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ b/python/cudf/cudf/_lib/strings/wrap.pyx @@ -1,17 +1,13 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def wrap(Column source_strings, @@ -21,14 +17,8 @@ def wrap(Column source_strings, in the Column to be formatted in paragraphs with length less than a given `width`. """ - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_wrap( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.wrap.wrap( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e2bdecbe67a..871ffc6269d 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -3,7 +3,7 @@ import pandas as pd from packaging import version -PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.2") +PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.3") PANDAS_VERSION = version.parse(pd.__version__) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de5ed15771d..864e87b5377 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1337,7 +1337,7 @@ def _set_categories( # Ensure new_categories is unique first if not (is_unique or new_cats.is_unique): - new_cats = cudf.Series(new_cats)._column.unique() + new_cats = new_cats.unique() if cur_cats.equals(new_cats, check_dtypes=True): # TODO: Internal usages don't always need a copy; add a copy keyword diff --git 
a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d0ea4612a1b..2c9b0baa9b6 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -480,6 +480,11 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: if dtype == self.dtype: return self + elif isinstance(dtype, pd.DatetimeTZDtype): + raise TypeError( + "Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. " + "Use tz_localize instead." + ) return libcudf.unary.cast(self, dtype=dtype) def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] @@ -940,6 +945,16 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: def as_string_column(self) -> cudf.core.column.StringColumn: return self._local_time.as_string_column() + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: + if isinstance(dtype, pd.DatetimeTZDtype) and dtype != self.dtype: + if dtype.unit != self.time_unit: + # TODO: Doesn't check that new unit is valid. 
+ casted = self._with_type_metadata(dtype) + else: + casted = self + return casted.tz_convert(str(dtype.tz)) + return super().as_datetime_column(dtype) + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( self._local_time, field diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 3b8dd05c13a..f6ab91f2f01 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -180,9 +180,12 @@ def var( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "var", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def std( self, @@ -190,9 +193,12 @@ def std( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "std", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4463e3280df..b50e23bd52e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -11,6 +11,8 @@ import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf import cudf.api.types from cudf import _lib as libcudf @@ -2546,9 +2548,9 @@ def split( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.split_re(self._column, pat, n) + data = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split( + data = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2719,9 +2721,9 @@ def rsplit( result_table = {0: self._column.copy()} else: 
if regex is True: - data, _ = libstrings.rsplit_re(self._column, pat, n) + data = libstrings.rsplit_re(self._column, pat, n) else: - data, _ = libstrings.rsplit( + data = libstrings.rsplit( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2820,7 +2822,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.partition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2885,7 +2887,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2966,7 +2968,7 @@ def pad( raise TypeError(msg) try: - side = libstrings.SideType[side.upper()] + side = plc.strings.side_type.SideType[side.upper()] except KeyError: raise ValueError( "side has to be either one of {'left', 'right', 'both'}" @@ -3624,6 +3626,46 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) + def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: + """ + Find first occurrence of pattern or regular expression in the + Series/Index. + + Parameters + ---------- + pat : str + Pattern or regular expression. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) + + Returns + ------- + Series + A Series of position values where the pattern first matches + each string. 
+ + Examples + -------- + >>> import cudf + >>> s = cudf.Series(['Lion', 'Monkey', 'Rabbit', 'Cat']) + >>> s.str.find_re('[ti]') + 0 1 + 1 -1 + 2 4 + 3 2 + dtype: int32 + """ + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "Unsupported value for `flags` parameter" + ) + + data = libstrings.find_re(self._column, pat, flags) + return self._return_or_inplace(data) + def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ Find all first occurrences of patterns in the Series/Index. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 16b0aa95c35..79ed5a0e187 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6287,14 +6287,17 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): ) if not skipna and any(col.nullable for col in filtered._columns): - mask = DataFrame( + length = filtered._data.nrows + ca = ColumnAccessor( { - name: filtered._data[name]._get_mask_as_column() - if filtered._data[name].nullable - else as_column(True, length=len(filtered._data[name])) - for name in filtered._column_names - } + name: col._get_mask_as_column() + if col.nullable + else as_column(True, length=length) + for name, col in filtered._data.items() + }, + verify=False, ) + mask = DataFrame._from_data(ca) mask = mask.all(axis=1) else: mask = None @@ -6679,19 +6682,10 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) return Series._from_column(result, index=self.index) else: - result_df = DataFrame(result).set_index(self.index) + result_df = DataFrame(result, index=self.index) result_df._set_columns_like(prepared._data) return result_df - @_performance_tracking - def _columns_view(self, columns): - """ - Return a subset of the DataFrame's columns as a view. 
- """ - return DataFrame( - {col: self._data[col] for col in columns}, index=self.index - ) - @_performance_tracking def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame's columns based on the column dtypes. @@ -6763,8 +6757,6 @@ def select_dtypes(self, include=None, exclude=None): if not isinstance(exclude, (list, tuple)): exclude = (exclude,) if exclude is not None else () - df = DataFrame(index=self.index) - # cudf_dtype_from_pydata_dtype can distinguish between # np.float and np.number selection = tuple(map(frozenset, (include, exclude))) @@ -6820,12 +6812,12 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._column_labels_and_values: - infered_type = cudf_dtype_from_pydata_dtype(col.dtype) - if infered_type in inclusion: - df._insert(len(df._data), k, col) - - return df + to_select = [ + label + for label, dtype in self._dtypes + if cudf_dtype_from_pydata_dtype(dtype) in inclusion + ] + return self.loc[:, to_select] @ioutils.doc_to_parquet() def to_parquet( @@ -7331,7 +7323,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): cov = cupy.cov(self.values, ddof=ddof, rowvar=False) cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) + df = DataFrame(cupy.asfortranarray(cov), index=cols) df._set_columns_like(self._data) return df @@ -7374,7 +7366,7 @@ def corr( corr = cupy.corrcoef(values, rowvar=False) cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) + df = DataFrame(cupy.asfortranarray(corr), index=cols) df._set_columns_like(self._data) return df diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index be05075a2cd..81b20488d8d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -27,6 +27,7 @@ from cudf.core.abc import 
Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column from cudf.core.column_accessor import ColumnAccessor +from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex @@ -754,17 +755,33 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): left_cols = list(self.grouping.keys.drop_duplicates()._columns) right_cols = list(result_index._columns) join_keys = [ - _match_join_keys(lcol, rcol, "left") + _match_join_keys(lcol, rcol, "inner") for lcol, rcol in zip(left_cols, right_cols) ] # TODO: In future, see if we can centralize # logic else where that has similar patterns. join_keys = map(list, zip(*join_keys)) - _, indices = libcudf.join.join( - *join_keys, - how="left", + # By construction, left and right keys are related by + # a permutation, so we can use an inner join. + left_order, right_order = libcudf.join.join( + *join_keys, how="inner" + ) + # left order is some permutation of the ordering we + # want, and right order is a matching gather map for + # the result table. Get the correct order by sorting + # the right gather map. 
+ (right_order,) = libcudf.sort.sort_by_key( + [right_order], + [left_order], + [True], + ["first"], + stable=False, + ) + result = result._gather( + GatherMap.from_column_unchecked( + right_order, len(result), nullify=False + ) ) - result = result.take(indices) if not self._as_index: result = result.reset_index() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6de3981ba66..92d094d9de5 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -700,7 +700,10 @@ def _compute_validity_mask(self, index, row_tuple, max_length): lookup_dict[i] = row lookup = cudf.DataFrame(lookup_dict) frame = cudf.DataFrame._from_data( - ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ColumnAccessor( + dict(enumerate(index._columns)), + verify=False, + ) ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) @@ -780,18 +783,12 @@ def _index_and_downcast(self, result, index, index_key): index_key = index_key[0] slice_access = isinstance(index_key, slice) - out_index = cudf.DataFrame() - # Select the last n-k columns where n is the number of columns and k is + # Count the last n-k columns where n is the number of columns and k is # the length of the indexing tuple size = 0 if not isinstance(index_key, (numbers.Number, slice)): size = len(index_key) - for k in range(size, len(index._data)): - out_index.insert( - out_index._num_columns, - k, - cudf.Series._from_column(index._columns[k]), - ) + num_selected = max(0, index.nlevels - size) # determine if we should downcast from a DataFrame to a Series need_downcast = ( @@ -814,16 +811,13 @@ def _index_and_downcast(self, result, index, index_key): result = cudf.Series._from_data( {}, name=tuple(col[0] for col in index._columns) ) - elif out_index._num_columns == 1: + elif num_selected == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to 
that column's name. - last_column = index._columns[-1] - out_index = cudf.Index._from_column( - last_column, name=index.names[-1] - ) - index = out_index - elif out_index._num_columns > 1: + *_, last_column = index._data.columns + index = cudf.Index._from_column(last_column, name=index.names[-1]) + elif num_selected > 1: # Otherwise pop the leftmost levels, names, and codes from the # source index until it has the correct number of columns (n-k) result.reset_index(drop=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 401fef67ee6..3d132c92d54 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -681,7 +681,7 @@ def _tile(A, reps): nval = len(value_vars) dtype = min_unsigned_type(nval) - if not var_name: + if var_name is None: var_name = "variable" if not value_vars: @@ -961,14 +961,14 @@ def _merge_sorted( ) -def _pivot(df, index, columns): +def _pivot(col_accessor: ColumnAccessor, index, columns) -> cudf.DataFrame: """ Reorganize the values of the DataFrame according to the given index and columns. 
Parameters ---------- - df : DataFrame + col_accessor : DataFrame index : cudf.Index Index labels of the result columns : cudf.Index @@ -985,7 +985,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._column_labels_and_values: + for col_label, col in col_accessor.items(): names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1067,22 +1067,21 @@ def pivot(data, columns=None, index=no_default, values=no_default): 2 three """ - df = data values_is_list = True if values is no_default: - values = df._columns_view( - col for col in df._column_names if col not in (index, columns) - ) + cols_to_select = [ + col for col in data._column_names if col not in (index, columns) + ] + elif not isinstance(values, (list, tuple)): + cols_to_select = [values] + values_is_list = False else: - if not isinstance(values, (list, tuple)): - values = [values] - values_is_list = False - values = df._columns_view(values) + cols_to_select = values if index is no_default: - index = df.index + index = data.index else: - index = cudf.Index(df.loc[:, index]) - columns = cudf.Index(df.loc[:, columns]) + index = cudf.Index(data.loc[:, index]) + columns = cudf.Index(data.loc[:, columns]) # Create a DataFrame composed of columns from both # columns and index @@ -1096,7 +1095,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): if len(columns_index) != len(columns_index.drop_duplicates()): raise ValueError("Duplicate index-column pairs found. 
Cannot reshape.") - result = _pivot(values, index, columns) + result = _pivot(data._data.select_by_label(cols_to_select), index, columns) # MultiIndex to Index if not values_is_list: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index acd97c2047c..41ee94b72c8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2943,7 +2943,7 @@ def corr(self, other, method="pearson", min_periods=None): >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.corr(ser2, method="pearson") - -0.20454263717316112 + -0.20454263717316126 >>> ser1.corr(ser2, method="spearman") -0.5 """ diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index ef0f6958aeb..094df955273 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from typing import Literal +from typing import TYPE_CHECKING, Literal import numpy as np @@ -10,6 +10,9 @@ from cudf.api.types import is_numeric_dtype from cudf.core.window.rolling import _RollingBase +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + class ExponentialMovingWindow(_RollingBase): r""" @@ -179,8 +182,10 @@ def cov( ): raise NotImplementedError("cov not yet supported.") - def _apply_agg_series(self, sr, agg_name): - if not is_numeric_dtype(sr.dtype): + def _apply_agg_column( + self, source_column: ColumnBase, agg_name: str + ) -> ColumnBase: + if not is_numeric_dtype(source_column.dtype): raise TypeError("No numeric types to aggregate") # libcudf ewm has special casing for nulls only @@ -188,20 +193,14 @@ def _apply_agg_series(self, sr, agg_name): # pandas does nans in the same positions mathematically. # as such we need to convert the nans to nulls before # passing them in. 
- to_libcudf_column = sr._column.astype("float64").nans_to_nulls() - - return self.obj._from_data_like_self( - self.obj._data._from_columns_like_self( - [ - scan( - agg_name, - to_libcudf_column, - True, - com=self.com, - adjust=self.adjust, - ) - ] - ) + to_libcudf_column = source_column.astype("float64").nans_to_nulls() + + return scan( + agg_name, + to_libcudf_column, + True, + com=self.com, + adjust=self.adjust, ) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 043a41145e5..967edc2ab15 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -2,6 +2,7 @@ from __future__ import annotations import warnings +from typing import TYPE_CHECKING import numba import pandas as pd @@ -16,25 +17,29 @@ from cudf.utils import cudautils from cudf.utils.utils import GetAttrGetItemMixin +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + class _RollingBase: """ - Contains methods common to all kinds of rolling + Contains routines to apply a window aggregation to a column. 
""" - def _apply_agg_dataframe(self, df, agg_name): - result_df = cudf.DataFrame({}) - for i, col_name in enumerate(df.columns): - result_col = self._apply_agg_series(df[col_name], agg_name) - result_df.insert(i, col_name, result_col) - result_df.index = df.index - return result_df + obj: cudf.DataFrame | cudf.Series - def _apply_agg(self, agg_name): - if isinstance(self.obj, cudf.Series): - return self._apply_agg_series(self.obj, agg_name) - else: - return self._apply_agg_dataframe(self.obj, agg_name) + def _apply_agg_column( + self, source_column: ColumnBase, agg_name: str + ) -> ColumnBase: + raise NotImplementedError + + def _apply_agg(self, agg_name: str) -> cudf.DataFrame | cudf.Series: + applied = ( + self._apply_agg_column(col, agg_name) for col in self.obj._columns + ) + return self.obj._from_data_like_self( + self.obj._data._from_columns_like_self(applied) + ) class Rolling(GetAttrGetItemMixin, _RollingBase, Reducible): @@ -290,14 +295,6 @@ def _apply_agg_column(self, source_column, agg_name): agg_params=self.agg_params, ) - def _apply_agg(self, agg_name): - applied = ( - self._apply_agg_column(col, agg_name) for col in self.obj._columns - ) - return self.obj._from_data_like_self( - self.obj._data._from_columns_like_self(applied) - ) - def _reduce( self, op: str, diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 0c1cda8810b..c364d55e677 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -965,7 +965,7 @@ def _fast_slow_function_call( except Exception as err: if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False): raise ProxyFallbackError( - f"The operation failed with cuDF, the reason was {type(err)}: {err}." 
+ f"The operation failed with cuDF, the reason was {type(err)}: {err}" ) from err with nvtx.annotate( "EXECUTE_SLOW", diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py new file mode 100644 index 00000000000..a009802bab0 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"]) +def with_nulls(request): + return request.param + + +@pytest.mark.parametrize("nrows", [30, 300, 300_000]) +@pytest.mark.parametrize("nkeys", [1, 2, 4]) +def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): + key_names = [f"key{key}" for key in range(nkeys)] + key_values = [np.random.randint(100, size=nrows) for _ in key_names] + value = np.random.randint(-100, 100, size=nrows) + df = cudf.DataFrame(dict(zip(key_names, key_values), value=value)) + if with_nulls: + for key in key_names: + df.loc[df[key] == 1, key] = None + with cudf.option_context("mode.pandas_compatible", True): + got = df.groupby(key_names, sort=False).agg({"value": "sum"}) + expect = ( + df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"}) + ) + assert_eq(expect, got, check_index_type=not with_nulls) diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini deleted file mode 100644 index 496a322ff80..00000000000 --- a/python/cudf/cudf/tests/pytest.ini +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -[pytest] -markers = - spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON` -xfail_strict = true -filterwarnings = - error - ignore:::.*xdist.* - ignore:::.*pytest.* - # some third-party dependencies (e.g. 
'boto3') still using datetime.datetime.utcnow() - ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore - # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ - ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning - # PerformanceWarning from cupy warming up the JIT cache - ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning - # Ignore numba PEP 456 warning specific to arm machines - ignore:FNV hashing is not implemented in Numba.*:UserWarning -addopts = --tb=native diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index cea86a5499e..691da224f44 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -266,3 +266,25 @@ def test_pandas_compatible_non_zoneinfo_raises(klass): with cudf.option_context("mode.pandas_compatible", True): with pytest.raises(NotImplementedError): cudf.from_pandas(pandas_obj) + + +def test_astype_naive_to_aware_raises(): + ser = cudf.Series([datetime.datetime(2020, 1, 1)]) + with pytest.raises(TypeError): + ser.astype("datetime64[ns, UTC]") + with pytest.raises(TypeError): + ser.to_pandas().astype("datetime64[ns, UTC]") + + +@pytest.mark.parametrize("unit", ["ns", "us"]) +def test_astype_aware_to_aware(unit): + ser = cudf.Series( + [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)] + ) + result = ser.astype(f"datetime64[{unit}, US/Pacific]") + expected = ser.to_pandas().astype(f"datetime64[{unit}, US/Pacific]") + zoneinfo_type = pd.DatetimeTZDtype( + expected.dtype.unit, zoneinfo.ZoneInfo(str(expected.dtype.tz)) + ) + expected = ser.astype(zoneinfo_type) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4a2345fc009..976b12a9ab5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ 
b/python/cudf/cudf/tests/test_datetime.py @@ -2525,23 +2525,7 @@ def test_dti_asi8(): @pytest.mark.parametrize( "method, kwargs", - [ - ["mean", {}], - pytest.param( - "std", - {}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - pytest.param( - "std", - {"ddof": 0}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - ], + [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], ) def test_dti_reduction(method, kwargs): pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 4235affd4d1..3adbe1d2a74 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -119,6 +119,15 @@ def test_melt_str_scalar_id_var(): assert_eq(result, expected) +def test_melt_falsy_var_name(): + df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) + result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") + expected = pd.melt( + df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" + ) + assert_eq(result, expected) + + @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index cc88cc79769..45143211a11 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1899,6 +1899,26 @@ def test_string_findall(pat, flags): assert_eq(expected, actual) +@pytest.mark.parametrize( + "pat, flags, pos", + [ + ("Monkey", 0, [-1, 0, -1, -1]), + ("on", 0, [2, 1, -1, 1]), + ("bit", 0, [-1, -1, 3, -1]), + ("on$", 0, [2, -1, -1, -1]), + ("on$", re.MULTILINE, [2, -1, -1, 1]), + ("o.*k", re.DOTALL, [-1, 1, -1, 1]), + ], +) +def test_string_find_re(pat, flags, pos): + test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] + gs = 
cudf.Series(test_data) + + expected = pd.Series(pos, dtype=np.int32) + actual = gs.str.find_re(pat, flags) + assert_eq(expected, actual) + + def test_string_replace_multi(): ps = pd.Series(["hello", "goodbye"]) gs = cudf.Series(["hello", "goodbye"]) diff --git a/python/cudf/cudf_pandas_tests/pytest.ini b/python/cudf/cudf_pandas_tests/pytest.ini new file mode 100644 index 00000000000..46e2448ea24 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/pytest.ini @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# Note, this config file overrides the default "cudf" test config in +# ../pyproject.toml We do so deliberately because we have different +# treatment of markers and warnings +[pytest] +addopts = --tb=native --strict-config --strict-markers +empty_parameter_set_mark = fail_at_collect +xfail_strict = true diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 2bbed40e34e..a74b7148c00 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import collections +import contextlib import copy import datetime import operator @@ -21,10 +22,15 @@ import pyarrow as pa import pytest from nbconvert.preprocessors import ExecutePreprocessor -from numba import NumbaDeprecationWarning, vectorize +from numba import ( + NumbaDeprecationWarning, + __version__ as numba_version, + vectorize, +) +from packaging import version from pytz import utc -from cudf.core._compat import PANDAS_GE_220 +from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220, PANDAS_VERSION from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import ( ProxyFallbackError, @@ -52,8 +58,6 @@ get_calendar, ) -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION - # Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow cudf = 
xpd._fsproxy_fast @@ -622,10 +626,6 @@ def test_array_function_series_fallback(series): tm.assert_equal(expect, got) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) def test_timedeltaproperties(series): psr, sr = series psr, sr = psr.astype("timedelta64[ns]"), sr.astype("timedelta64[ns]") @@ -685,10 +685,6 @@ def test_maintain_container_subclasses(multiindex): assert isinstance(got, xpd.core.indexes.frozen.FrozenList) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas due to unsupported boxcar window type", -) def test_rolling_win_type(): pdf = pd.DataFrame(range(5)) df = xpd.DataFrame(range(5)) @@ -697,8 +693,14 @@ def test_rolling_win_type(): tm.assert_equal(result, expected) -@pytest.mark.skip( - reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009" +@pytest.mark.skipif( + version.parse(numba_version) < version.parse("0.59"), + reason="Requires Numba 0.59 to fix segfaults on ARM. 
See https://github.com/numba/llvmlite/pull/1009", +) +@pytest.mark.xfail( + version.parse(numba_version) >= version.parse("0.59") + and PANDAS_VERSION < version.parse("2.1"), + reason="numba.generated_jit removed in 0.59, requires pandas >= 2.1", ) def test_rolling_apply_numba_engine(): def weighted_mean(x): @@ -709,7 +711,12 @@ def weighted_mean(x): pdf = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) df = xpd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) - with pytest.warns(NumbaDeprecationWarning): + ctx = ( + contextlib.nullcontext() + if PANDAS_GE_210 + else pytest.warns(NumbaDeprecationWarning) + ) + with ctx: expect = pdf.rolling(2, method="table", min_periods=0).apply( weighted_mean, raw=True, engine="numba" ) @@ -1305,7 +1312,7 @@ def max_times_two(self): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="DatetimeArray.__floordiv__ missing in pandas-2.0.0", ) def test_floordiv_array_vs_df(): @@ -1580,7 +1587,7 @@ def test_numpy_cupy_flatiter(series): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="pyarrow_numpy storage type was not supported in pandas-2.0.0", ) def test_arrow_string_arrays(): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 605f9be5a49..c0776fd0de6 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "ptxcompiler", "pyarrow>=14.0.0,<18.0.0a0", "pylibcudf==24.12.*,>=0.0.0a0", @@ -124,6 +124,27 @@ skip = [ "__init__.py", ] +[tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*", + # some third-party dependencies (e.g. 
'boto3') still using datetime.datetime.utcnow() + "ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore", + # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ + "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning", + # PerformanceWarning from cupy warming up the JIT cache + "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning", + # Ignore numba PEP 456 warning specific to arm machines + "ignore:FNV hashing is not implemented in Numba.*:UserWarning" +] +markers = [ + "spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`" +] +xfail_strict = true + [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/cudf_kafka/cudf_kafka/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -[pytest] -addopts = --tb=native diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a1a3ec37842..87e19a2bccf 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -79,9 +79,12 @@ skip = [ ] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error" ] +xfail_strict = true [tool.scikit-build] build-dir = "build/{wheel_tag}" diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index 06bb08953f1..3b1eff4a0d0 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "NamedColumn"] +__all__: list[str] = ["DataFrame", "Column"] -from cudf_polars.containers.column import Column, NamedColumn +from cudf_polars.containers.column import Column from cudf_polars.containers.dataframe import DataFrame diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 3fe3e5557cb..00186098e54 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -15,7 +15,7 @@ import polars as pl -__all__: list[str] = ["Column", "NamedColumn"] +__all__: list[str] = ["Column"] class Column: @@ -26,6 +26,9 @@ class Column: order: plc.types.Order null_order: plc.types.NullOrder is_scalar: bool + # Optional name, only ever set by evaluation of NamedExpr nodes + # The internal evaluation should not care about the name. 
+ name: str | None def __init__( self, @@ -34,14 +37,12 @@ def __init__( is_sorted: plc.types.Sorted = plc.types.Sorted.NO, order: plc.types.Order = plc.types.Order.ASCENDING, null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + name: str | None = None, ): self.obj = column self.is_scalar = self.obj.size() == 1 - if self.obj.size() <= 1: - is_sorted = plc.types.Sorted.YES - self.is_sorted = is_sorted - self.order = order - self.null_order = null_order + self.name = name + self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order) @functools.cached_property def obj_scalar(self) -> plc.Scalar: @@ -63,9 +64,26 @@ def obj_scalar(self) -> plc.Scalar: ) return plc.copying.get_element(self.obj, 0) + def rename(self, name: str | None, /) -> Self: + """ + Return a shallow copy with a new name. + + Parameters + ---------- + name + New name + + Returns + ------- + Shallow copy of self with new name set. + """ + new = self.copy() + new.name = name + return new + def sorted_like(self, like: Column, /) -> Self: """ - Copy sortedness properties from a column onto self. + Return a shallow copy with sortedness from like. Parameters ---------- @@ -74,20 +92,23 @@ def sorted_like(self, like: Column, /) -> Self: Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. See Also -------- set_sorted, copy_metadata """ - return self.set_sorted( - is_sorted=like.is_sorted, order=like.order, null_order=like.null_order + return type(self)( + self.obj, + name=self.name, + is_sorted=like.is_sorted, + order=like.order, + null_order=like.null_order, ) - # TODO: Return Column once #16272 is fixed. - def astype(self, dtype: plc.DataType) -> plc.Column: + def astype(self, dtype: plc.DataType) -> Column: """ - Return the backing column as the requested dtype. + Cast the column to as the requested dtype. Parameters ---------- @@ -109,8 +130,10 @@ def astype(self, dtype: plc.DataType) -> plc.Column: the current one. 
""" if self.obj.type() != dtype: - return plc.unary.cast(self.obj, dtype) - return self.obj + return Column(plc.unary.cast(self.obj, dtype), name=self.name).sorted_like( + self + ) + return self def copy_metadata(self, from_: pl.Series, /) -> Self: """ @@ -129,6 +152,7 @@ def copy_metadata(self, from_: pl.Series, /) -> Self: -------- set_sorted, sorted_like """ + self.name = from_.name if len(from_) <= 1: return self ascending = from_.flags["SORTED_ASC"] @@ -192,6 +216,7 @@ def copy(self) -> Self: is_sorted=self.is_sorted, order=self.order, null_order=self.null_order, + name=self.name, ) def mask_nans(self) -> Self: @@ -217,58 +242,3 @@ def nan_count(self) -> int: ) ).as_py() return 0 - - -class NamedColumn(Column): - """A column with a name.""" - - name: str - - def __init__( - self, - column: plc.Column, - name: str, - *, - is_sorted: plc.types.Sorted = plc.types.Sorted.NO, - order: plc.types.Order = plc.types.Order.ASCENDING, - null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, - ) -> None: - super().__init__( - column, is_sorted=is_sorted, order=order, null_order=null_order - ) - self.name = name - - def copy(self, *, new_name: str | None = None) -> Self: - """ - A shallow copy of the column. - - Parameters - ---------- - new_name - Optional new name for the copied column. - - Returns - ------- - New column sharing data with self. - """ - return type(self)( - self.obj, - self.name if new_name is None else new_name, - is_sorted=self.is_sorted, - order=self.order, - null_order=self.null_order, - ) - - def mask_nans(self) -> Self: - """Return a shallow copy of self with nans masked out.""" - # Annoying, the inheritance is not right (can't call the - # super-type mask_nans), but will sort that by refactoring - # later. 
- if plc.traits.is_floating_point(self.obj.type()): - old_count = self.obj.null_count() - mask, new_count = plc.transform.nans_to_nulls(self.obj) - result = type(self)(self.obj.with_mask(mask, new_count), self.name) - if old_count == new_count: - return result.sorted_like(self) - return result - return self.copy() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index f3e3862d0cc..2c195f6637c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,43 +5,50 @@ from __future__ import annotations -import itertools from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pyarrow as pa import pylibcudf as plc import polars as pl -from cudf_polars.containers.column import NamedColumn +from cudf_polars.containers import Column from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Mapping, Sequence, Set + from collections.abc import Iterable, Mapping, Sequence, Set from typing_extensions import Self - from cudf_polars.containers import Column - __all__: list[str] = ["DataFrame"] +# Pacify the type checker. DataFrame init asserts that all the columns +# have a string name, so let's narrow the type. 
+class NamedColumn(Column): + name: str + + class DataFrame: """A representation of a dataframe.""" - columns: list[NamedColumn] + column_map: dict[str, Column] table: plc.Table + columns: list[NamedColumn] - def __init__(self, columns: Sequence[NamedColumn]) -> None: - self.columns = list(columns) - self._column_map = {c.name: c for c in self.columns} - self.table = plc.Table([c.obj for c in columns]) + def __init__(self, columns: Iterable[Column]) -> None: + columns = list(columns) + if any(c.name is None for c in columns): + raise ValueError("All columns must have a name") + self.columns = [cast(NamedColumn, c) for c in columns] + self.column_map = {c.name: c for c in self.columns} + self.table = plc.Table([c.obj for c in self.columns]) def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)([c.copy() for c in self.columns]) + return type(self)(c.copy() for c in self.columns) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" @@ -51,42 +58,38 @@ def to_polars(self) -> pl.DataFrame: # https://github.com/pola-rs/polars/issues/11632 # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. 
- name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} + name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} table: pa.Table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) df: pl.DataFrame = pl.from_arrow(table) return df.rename(name_map).with_columns( - *( - pl.col(c.name).set_sorted( - descending=c.order == plc.types.Order.DESCENDING - ) - if c.is_sorted - else pl.col(c.name) - for c in self.columns - ) + pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING) + if c.is_sorted + else pl.col(c.name) + for c in self.columns ) @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" - return frozenset(c.name for c in self.columns) + return frozenset(self.column_map) @cached_property def column_names(self) -> list[str]: """Return a list of the column names.""" - return [c.name for c in self.columns] + return list(self.column_map) @cached_property def num_columns(self) -> int: """Number of columns.""" - return len(self.columns) + return len(self.column_map) @cached_property def num_rows(self) -> int: """Number of rows.""" - return 0 if len(self.columns) == 0 else self.table.num_rows() + return self.table.num_rows() if self.column_map else 0 @classmethod def from_polars(cls, df: pl.DataFrame) -> Self: @@ -111,12 +114,8 @@ def from_polars(cls, df: pl.DataFrame) -> Self: # No-op if the schema is unchanged. 
d_table = plc.interop.from_arrow(table.cast(schema)) return cls( - [ - NamedColumn(column, h_col.name).copy_metadata(h_col) - for column, h_col in zip( - d_table.columns(), df.iter_columns(), strict=True - ) - ] + Column(column).copy_metadata(h_col) + for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True) ) @classmethod @@ -144,17 +143,14 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") return cls( - [ - NamedColumn(c, name) - for c, name in zip(table.columns(), names, strict=True) - ] + Column(c, name=name) for c, name in zip(table.columns(), names, strict=True) ) def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None ) -> Self: """ - Copy sortedness from a dataframe onto self. + Return a shallow copy with sortedness copied from like. Parameters ---------- @@ -165,7 +161,7 @@ def sorted_like( Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. Raises ------ @@ -175,13 +171,12 @@ def sorted_like( if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset - self.columns = [ + return type(self)( c.sorted_like(other) if c.name in subset else c for c, other in zip(self.columns, like.columns, strict=True) - ] - return self + ) - def with_columns(self, columns: Sequence[NamedColumn]) -> Self: + def with_columns(self, columns: Iterable[Column], *, replace_only=False) -> Self: """ Return a new dataframe with extra columns. @@ -189,6 +184,8 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ---------- columns Columns to add + replace_only + If true, then only replacements are allowed (matching by name). 
Returns ------- @@ -196,36 +193,30 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: Notes ----- - If column names overlap, newer names replace older ones. + If column names overlap, newer names replace older ones, and + appear in the same order as the original frame. """ - columns = list( - {c.name: c for c in itertools.chain(self.columns, columns)}.values() - ) - return type(self)(columns) + new = {c.name: c for c in columns} + if replace_only and not self.column_names_set.issuperset(new.keys()): + raise ValueError("Cannot replace with non-existing names") + return type(self)((self.column_map | new).values()) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" - return type(self)([c for c in self.columns if c.name not in names]) + return type(self)(column for column in self.columns if column.name not in names) def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" - want = set(names) - if not want.issubset(self.column_names_set): - raise ValueError("Can't select missing names") - return type(self)([self._column_map[name] for name in names]) - - def replace_columns(self, *columns: NamedColumn) -> Self: - """Return a new dataframe with columns replaced by name.""" - new = {c.name: c for c in columns} - if not set(new).issubset(self.column_names_set): - raise ValueError("Cannot replace with non-existing names") - return type(self)([new.get(c.name, c) for c in self.columns]) + try: + return type(self)(self.column_map[name] for name in names) + except KeyError as e: + raise ValueError("Can't select missing names") from e def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" - return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns]) + return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns) - def select_columns(self, names: Set[str]) -> list[NamedColumn]: + def select_columns(self, names: Set[str]) -> 
list[Column]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c401e5a2f17..f7775ceb238 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -27,7 +27,7 @@ from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr -from cudf_polars.containers import Column, NamedColumn +from cudf_polars.containers import Column from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: @@ -313,7 +313,7 @@ def evaluate( *, context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, - ) -> NamedColumn: + ) -> Column: """ Evaluate this expression given a dataframe for context. @@ -328,20 +328,15 @@ def evaluate( Returns ------- - NamedColumn attaching a name to an evaluated Column + Evaluated Column with name attached. See Also -------- :meth:`Expr.evaluate` for details, this function just adds the name to a column produced from an expression. """ - obj = self.value.evaluate(df, context=context, mapping=mapping) - return NamedColumn( - obj.obj, - self.name, - is_sorted=obj.is_sorted, - order=obj.order, - null_order=obj.null_order, + return self.value.evaluate(df, context=context, mapping=mapping).rename( + self.name ) def collect_agg(self, *, depth: int) -> AggInfo: @@ -428,7 +423,9 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - return df._column_map[self.name] + # Deliberately remove the name here so that we guarantee + # evaluation of the IR produces names. 
+ return df.column_map[self.name].rename(None) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" @@ -914,7 +911,7 @@ def do_evaluate( col = self.children[0].evaluate(df, context=context, mapping=mapping) is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format.encode() + col.obj, format ) if strict: @@ -937,7 +934,7 @@ def do_evaluate( ) return Column( plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format.encode() + res.columns()[0], self.dtype, format ) ) elif self.name == pl_expr.StringFunction.Replace: @@ -961,16 +958,16 @@ def do_evaluate( class TemporalFunction(Expr): __slots__ = ("name", "options", "children") _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { - pl_expr.TemporalFunction.Year: "year", - pl_expr.TemporalFunction.Month: "month", - pl_expr.TemporalFunction.Day: "day", - pl_expr.TemporalFunction.WeekDay: "weekday", - pl_expr.TemporalFunction.Hour: "hour", - pl_expr.TemporalFunction.Minute: "minute", - pl_expr.TemporalFunction.Second: "second", - pl_expr.TemporalFunction.Millisecond: "millisecond", - pl_expr.TemporalFunction.Microsecond: "microsecond", - pl_expr.TemporalFunction.Nanosecond: "nanosecond", + pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, + pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, + pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, + pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, + pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, + pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, + pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, + pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, + pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, + pl_expr.TemporalFunction.Nanosecond: 
plc.datetime.DatetimeComponent.NANOSECOND, } _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] @@ -1003,8 +1000,12 @@ def do_evaluate( ] (column,) = columns if self.name == pl_expr.TemporalFunction.Microsecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) millis_as_micros = plc.binaryop.binary_operation( millis, plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), @@ -1019,9 +1020,15 @@ def do_evaluate( ) return Column(total_micros) elif self.name == pl_expr.TemporalFunction.Nanosecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") - nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) + nanos = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.NANOSECOND + ) millis_as_nanos = plc.binaryop.binary_operation( millis, plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 8cd56c8ee3a..e319c363a23 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -26,7 +26,7 @@ import polars as pl import cudf_polars.dsl.expr as expr -from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.containers import Column, DataFrame from cudf_polars.utils import dtypes, 
sorting if TYPE_CHECKING: @@ -57,9 +57,7 @@ ] -def broadcast( - *columns: NamedColumn, target_length: int | None = None -) -> list[NamedColumn]: +def broadcast(*columns: Column, target_length: int | None = None) -> list[Column]: """ Broadcast a sequence of columns to a common length. @@ -112,12 +110,12 @@ def broadcast( return [ column if column.obj.size() != 1 - else NamedColumn( + else Column( plc.Column.from_scalar(column.obj_scalar, nrows), - column.name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.BEFORE, + name=column.name, ) for column in columns ] @@ -385,15 +383,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: init = plc.interop.from_arrow( pa.scalar(offset, type=plc.interop.to_arrow(dtype)) ) - index = NamedColumn( + index = Column( plc.filling.sequence(df.num_rows, init, step), - name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, + name=name, ) df = DataFrame([index, *df.columns]) - assert all(c.obj.type() == self.schema[c.name] for c in df.columns) + assert all( + c.obj.type() == self.schema[name] for name, c in df.column_map.items() + ) if self.predicate is None: return df else: @@ -588,44 +588,58 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: requests.append(plc.groupby.GroupByRequest(col, [req])) replacements.append(rep) group_keys, raw_tables = grouper.aggregate(requests) - # TODO: names - raw_columns: list[NamedColumn] = [] + raw_columns: list[Column] = [] for i, table in enumerate(raw_tables): (column,) = table.columns() - raw_columns.append(NamedColumn(column, f"tmp{i}")) + raw_columns.append(Column(column, name=f"tmp{i}")) mapping = dict(zip(replacements, raw_columns, strict=True)) result_keys = [ - NamedColumn(gk, k.name) - for gk, k in zip(group_keys.columns(), keys, strict=True) + Column(grouped_key, name=key.name) + for key, grouped_key in zip(keys, group_keys.columns(), 
strict=True) ] result_subs = DataFrame(raw_columns) results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] broadcasted = broadcast(*result_keys, *results) - result_keys = broadcasted[: len(result_keys)] - results = broadcasted[len(result_keys) :] # Handle order preservation of groups - # like cudf classic does - # https://github.com/rapidsai/cudf/blob/5780c4d8fb5afac2e04988a2ff5531f94c22d3a3/python/cudf/cudf/core/groupby/groupby.py#L723-L743 if self.maintain_order and not sorted: - left = plc.stream_compaction.stable_distinct( + # The order we want + want = plc.stream_compaction.stable_distinct( plc.Table([k.obj for k in keys]), list(range(group_keys.num_columns())), plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) - right = plc.Table([key.obj for key in result_keys]) - _, indices = plc.join.left_join(left, right, plc.types.NullEquality.EQUAL) + # The order we have + have = plc.Table([key.obj for key in broadcasted[: len(keys)]]) + + # We know an inner join is OK because by construction + # want and have are permutations of each other. + left_order, right_order = plc.join.inner_join( + want, have, plc.types.NullEquality.EQUAL + ) + # Now left_order is an arbitrary permutation of the ordering we + # want, and right_order is a matching permutation of the ordering + # we have. To get to the original ordering, we need + # left_order == iota(nrows), with right_order permuted + # appropriately. This can be obtained by sorting + # right_order by left_order. 
+ (right_order,) = plc.sorting.sort_by_key( + plc.Table([right_order]), + plc.Table([left_order]), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER], + ).columns() ordered_table = plc.copying.gather( plc.Table([col.obj for col in broadcasted]), - indices, + right_order, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) broadcasted = [ - NamedColumn(reordered, b.name) - for reordered, b in zip( + Column(reordered, name=old.name) + for reordered, old in zip( ordered_table.columns(), broadcasted, strict=True ) ] @@ -772,20 +786,20 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # result, not the gather maps columns = plc.join.cross_join(left.table, right.table).columns() left_cols = [ - NamedColumn(new, old.name).sorted_like(old) + Column(new, name=old.name).sorted_like(old) for new, old in zip( columns[: left.num_columns], left.columns, strict=True ) ] right_cols = [ - NamedColumn( + Column( new, - old.name - if old.name not in left.column_names_set - else f"{old.name}{suffix}", + name=name + if name not in left.column_names_set + else f"{name}{suffix}", ) - for new, old in zip( - columns[left.num_columns :], right.columns, strict=True + for new, name in zip( + columns[left.num_columns :], right.column_names, strict=True ) ] return DataFrame([*left_cols, *right_cols]) @@ -823,18 +837,19 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how != "inner": - left = left.replace_columns( - *( - NamedColumn( + left = left.with_columns( + ( + Column( plc.replace.replace_nulls(left_col.obj, right_col.obj), - left_col.name, + name=left_col.name, ) for left_col, right_col in zip( left.select_columns(left_on.column_names_set), right.select_columns(right_on.column_names_set), strict=True, ) - ) + ), + replace_only=True, ) right = right.discard_columns(right_on.column_names_set) if how == "right": @@ -916,9 +931,10 @@ def 
evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df = self.df.evaluate(cache=cache) if self.subset is None: indices = list(range(df.num_columns)) + keys_sorted = all(c.is_sorted for c in df.column_map.values()) else: indices = [i for i, k in enumerate(df.column_names) if k in self.subset] - keys_sorted = all(df.columns[i].is_sorted for i in indices) + keys_sorted = all(df.column_map[name].is_sorted for name in self.subset) if keys_sorted: table = plc.stream_compaction.unique( df.table, @@ -939,10 +955,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) + # TODO: Is this sortedness setting correct result = DataFrame( [ - NamedColumn(c, old.name).sorted_like(old) - for c, old in zip(table.columns(), df.columns, strict=True) + Column(new, name=old.name).sorted_like(old) + for new, old in zip(table.columns(), df.columns, strict=True) ] ) if keys_sorted or self.stable: @@ -993,30 +1010,30 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: sort_keys = broadcast( *(k.evaluate(df) for k in self.by), target_length=df.num_rows ) - names = {c.name: i for i, c in enumerate(df.columns)} # TODO: More robust identification here. 
- keys_in_result = [ - i - for k in sort_keys - if (i := names.get(k.name)) is not None and k.obj is df.columns[i].obj - ] + keys_in_result = { + k.name: i + for i, k in enumerate(sort_keys) + if k.name in df.column_map and k.obj is df.column_map[k.name].obj + } table = self.do_sort( df.table, plc.Table([k.obj for k in sort_keys]), self.order, self.null_order, ) - columns = [ - NamedColumn(c, old.name) - for c, old in zip(table.columns(), df.columns, strict=True) - ] - # If a sort key is in the result table, set the sortedness property - for k, i in enumerate(keys_in_result): - columns[i] = columns[i].set_sorted( - is_sorted=plc.types.Sorted.YES, - order=self.order[k], - null_order=self.null_order[k], - ) + columns: list[Column] = [] + for name, c in zip(df.column_map, table.columns(), strict=True): + column = Column(c, name=name) + # If a sort key is in the result table, set the sortedness property + if name in keys_in_result: + i = keys_in_result[name] + column = column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=self.order[i], + null_order=self.null_order[i], + ) + columns.append(column) return DataFrame(columns).slice(self.zlice) @@ -1065,7 +1082,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df = self.df.evaluate(cache=cache) # This can reorder things. columns = broadcast( - *df.select(list(self.schema.keys())).columns, target_length=df.num_rows + *(df.column_map[name] for name in self.schema), target_length=df.num_rows ) return DataFrame(columns) @@ -1110,7 +1127,7 @@ def __post_init__(self) -> None: old, new, _ = self.options # TODO: perhaps polars should validate renaming in the IR? 
if len(new) != len(set(new)) or ( - set(new) & (set(self.df.schema.keys() - set(old))) + set(new) & (set(self.df.schema.keys()) - set(old)) ): raise NotImplementedError("Duplicate new names in rename.") elif self.name == "unpivot": @@ -1155,7 +1172,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: npiv = len(pivotees) df = self.df.evaluate(cache=cache) index_columns = [ - NamedColumn(col, name) + Column(col, name=name) for col, name in zip( plc.reshape.tile(df.select(indices).table, npiv).columns(), indices, @@ -1176,13 +1193,16 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df.num_rows, ).columns() value_column = plc.concatenate.concatenate( - [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + [ + df.column_map[pivotee].astype(self.schema[value_name]).obj + for pivotee in pivotees + ] ) return DataFrame( [ *index_columns, - NamedColumn(variable_column, variable_name), - NamedColumn(value_column, value_name), + Column(variable_column, name=variable_name), + Column(value_column, name=value_name), ] ) else: @@ -1263,6 +1283,4 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) for df in dfs ] - return DataFrame( - list(itertools.chain.from_iterable(df.columns for df in dfs)), - ) + return DataFrame(itertools.chain.from_iterable(df.columns for df in dfs)) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index bff44af1468..7837a275f20 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -201,21 +201,21 @@ the logical plan in any case, so is reasonably natural. # Containers Containers should be constructed as relatively lightweight objects -around their pylibcudf counterparts. We have four (in +around their pylibcudf counterparts. We have three (in `cudf_polars/containers/`): 1. `Scalar` (a wrapper around a pylibcudf `Scalar`) 2. `Column` (a wrapper around a pylibcudf `Column`) -3. 
`NamedColumn` (a `Column` with an additional name) -4. `DataFrame` (a wrapper around a pylibcudf `Table`) +3. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly -speaking, a `DataFrame` is just a list of `NamedColumn`s which each -hold a `Column` plus a string `name`. `NamedColumn`s are only ever -constructed via `NamedExpr`s, which are the top-level expression node -that lives inside an `IR` node. This means that the expression -evaluator never has to concern itself with column names: columns are -only ever decorated with names when constructing a `DataFrame`. +speaking, a `DataFrame` is just a mapping from string `name`s to +`Column`s, and thus also holds a pylibcudf `Table`. Names are only +attached to `Column`s and hence inserted into `DataFrames` via +`NamedExpr`s, which are the top-level expression nodes that live +inside an `IR` node. This means that the expression evaluator never +has to concern itself with column names: columns are only ever +decorated with names when constructing a `DataFrame`. The columns keep track of metadata (for example, whether or not they are sorted). 
We could imagine tracking more metadata, like minimum and diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f55031e0826..5345fad41a2 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -50,6 +50,11 @@ license-files = ["LICENSE"] version = {file = "cudf_polars/VERSION"} [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error" +] xfail_strict = true [tool.coverage.report] diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 19919877f84..1f26ab1af9f 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -3,13 +3,11 @@ from __future__ import annotations -from functools import partial - import pyarrow import pylibcudf as plc import pytest -from cudf_polars.containers import Column, NamedColumn +from cudf_polars.containers import Column def test_non_scalar_access_raises(): @@ -55,11 +53,10 @@ def test_shallow_copy(): @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) -@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")]) -def test_mask_nans(typeid, constructor): +def test_mask_nans(typeid): dtype = plc.DataType(typeid) values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) - column = constructor(plc.interop.from_arrow(values)) + column = Column(plc.interop.from_arrow(values)) masked = column.mask_nans() assert column.obj.null_count() == masked.obj.null_count() diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 39fb44d55a5..5c68fb8f0aa 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -8,18 +8,18 @@ import polars as pl -from 
cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.containers import Column, DataFrame from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -30,17 +30,17 @@ def test_select_missing_raises(): def test_replace_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) - replacement = df.columns[0].copy(new_name="b") + replacement = df.column_map["a"].copy().rename("b") with pytest.raises(ValueError): - df.replace_columns(replacement) + df.with_columns([replacement], replace_only=True) def test_from_table_wrong_names(): @@ -55,14 +55,23 @@ def test_from_table_wrong_names(): DataFrame.from_table(table, ["a", "b"]) +def test_unnamed_column_raise(): + payload = plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 0, plc.MaskState.ALL_VALID + ) + + with pytest.raises(ValueError): + DataFrame([Column(payload, name="a"), Column(payload)]) + + def test_sorted_like_raises_mismatching_names(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -72,11 +81,11 @@ def test_sorted_like_raises_mismatching_names(): def test_shallow_copy(): - column = NamedColumn( + column = Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) column.set_sorted( is_sorted=plc.types.Sorted.YES, @@ -85,13 +94,13 @@ def test_shallow_copy(): ) df = DataFrame([column]) copy = df.copy() - copy.columns[0].set_sorted( + copy.column_map["a"].set_sorted( is_sorted=plc.types.Sorted.NO, order=plc.types.Order.ASCENDING, 
null_order=plc.types.NullOrder.AFTER, ) - assert df.columns[0].is_sorted == plc.types.Sorted.YES - assert copy.columns[0].is_sorted == plc.types.Sorted.NO + assert df.column_map["a"].is_sorted == plc.types.Sorted.YES + assert copy.column_map["a"].is_sorted == plc.types.Sorted.NO def test_sorted_flags_preserved_empty(): @@ -100,7 +109,7 @@ def test_sorted_flags_preserved_empty(): gf = DataFrame.from_polars(df) - (a,) = gf.columns + a = gf.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 76c7648813a..2a37683478b 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -69,7 +69,7 @@ def test_setsorted(descending, nulls_last, with_nulls): df = translate_ir(q._ldf.visit()).evaluate(cache={}) - (a,) = df.columns + a = df.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES null_order = ( diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/cudf_polars/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -[pytest] -addopts = --tb=native diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 74bf8b9e4e2..1e8246496cd 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -4,6 +4,7 @@ import itertools +import numpy as np import pytest import polars as pl @@ -191,3 +192,24 @@ def test_groupby_literal_in_agg(df, key, expr): def test_groupby_unary_non_pointwise_raises(df, expr): q = df.group_by("key1").agg(expr) assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("nrows", [30, 300, 300_000]) +@pytest.mark.parametrize("nkeys", [1, 2, 4]) +def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): + key_names = [f"key{key}" for key in range(nkeys)] + key_values = [np.random.randint(100, size=nrows) for _ in key_names] + value = np.random.randint(-100, 100, size=nrows) + df = pl.DataFrame(dict(zip(key_names, key_values, strict=True), value=value)) + if with_nulls: + df = df.with_columns( + *( + pl.when(pl.col(name) == 1) + .then(None) + .otherwise(pl.col(name)) + .alias(name) + for name in key_names + ) + ) + q = df.lazy().group_by(key_names, maintain_order=True).agg(pl.col("value").sum()) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py index 35aaef44e1f..e7770bfadac 100644 --- a/python/cudf_polars/tests/utils/test_broadcast.py +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -6,34 +6,35 @@ import pylibcudf as plc import pytest -from cudf_polars.containers import NamedColumn +from cudf_polars.containers import Column from cudf_polars.dsl.ir import broadcast @pytest.mark.parametrize("target", [4, None]) def test_broadcast_all_scalar(target): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = 
broadcast(*columns, target_length=target) expected = 1 if target is None else target + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == expected for column in result) def test_invalid_target_length(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 4, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -43,11 +44,11 @@ def test_invalid_target_length(): def test_broadcast_mismatching_column_lengths(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), i + 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -58,16 +59,17 @@ def test_broadcast_mismatching_column_lengths(): @pytest.mark.parametrize("nrows", [0, 5]) def test_broadcast_with_scalars(nrows): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), nrows if i == 0 else 1, plc.MaskState.ALL_VALID, ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns) + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == nrows for column in result) diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index bae4b051cae..8c0130d2818 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -377,24 +377,16 @@ def test_setitem_overwrites(stream): [ ({}, "sum"), ({}, "mean"), - pytest.param({}, "min"), + ({}, "min"), pytest.param( {}, "median", marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), ), - pytest.param({}, "max"), - pytest.param( - {}, - "var", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), - pytest.param({}, "count"), - pytest.param( - {"ddof": 0}, - "std", - 
marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), + ({}, "max"), + ({}, "var"), + ({}, "count"), + ({"ddof": 0}, "std"), pytest.param( {"quantile": 0.5}, "quantile", diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 85ab0024bb5..af45f49d9b4 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -111,6 +111,8 @@ skip = [ ] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", "ignore:unclosed =12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "rapids-dask-dependency==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -115,6 +115,8 @@ skip = [ ] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error::FutureWarning", "error::DeprecationWarning", @@ -125,3 +127,4 @@ filterwarnings = [ "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", ] +xfail_strict = true diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 0a8f5c4807d..5f9a04d3cee 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -22,6 +22,8 @@ project( LANGUAGES CXX ) +option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) + # Check if cudf is already available. If so, it is the user's responsibility to ensure that the # CMake package is also available at build time of the Python cudf package. 
find_package(cudf "${RAPIDS_VERSION}") @@ -39,14 +41,20 @@ set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(CUDF_EXPORT_NVCOMP OFF) +endif() set(CUDA_STATIC_RUNTIME ON) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) add_subdirectory(../../cpp cudf-cpp) -# Ensure other libraries needed by libcudf.so get installed alongside it. -include(cmake/Modules/WheelHelpers.cmake) -install_aliased_imported_targets( - TARGETS cudf nvcomp::nvcomp DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} -) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(rpaths "$ORIGIN/../../nvidia/nvcomp") + set_property( + TARGET cudf + PROPERTY INSTALL_RPATH ${rpaths} + APPEND + ) +endif() diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 5bffe9fd96c..84660cbc276 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -37,6 +37,9 @@ classifiers = [ "Programming Language :: C++", "Environment :: GPU :: NVIDIA CUDA", ] +dependencies = [ + "nvidia-nvcomp==4.0.1", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.urls] Homepage = "https://github.com/rapidsai/cudf" diff --git a/python/pylibcudf/LICENSE b/python/pylibcudf/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/pylibcudf/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a7cb66d7b16..1d72eacac12 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -66,3 +66,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) add_subdirectory(strings) add_subdirectory(io) +add_subdirectory(nvtext) diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index a384edd456d..b98b37fe0fd 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . cimport ( lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -78,4 +79,5 @@ __all__ = [ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 2a5365e8fad..304f27be340 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -28,6 +28,7 @@ lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -92,4 +93,5 @@ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd index 2fce48cf1b4..72ce680ba7a 100644 --- a/python/pylibcudf/pylibcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/datetime.pxd @@ -1,8 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from pylibcudf.libcudf.datetime cimport datetime_component + from .column cimport Column cpdef Column extract_year( Column col ) + +cpdef Column extract_datetime_component( + Column col, + datetime_component component +) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index e8e0caaf42d..784d29128bf 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -3,19 +3,14 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.datetime cimport ( - day_of_year as cpp_day_of_year, - extract_day as cpp_extract_day, - extract_hour as cpp_extract_hour, - extract_microsecond_fraction as cpp_extract_microsecond_fraction, - extract_millisecond_fraction as cpp_extract_millisecond_fraction, - extract_minute as cpp_extract_minute, - extract_month as cpp_extract_month, - extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, - extract_second as cpp_extract_second, - extract_weekday as cpp_extract_weekday, + datetime_component, + extract_datetime_component as cpp_extract_datetime_component, extract_year as cpp_extract_year, ) +from pylibcudf.libcudf.datetime import \ + datetime_component as DatetimeComponent # no-cython-lint + from .column cimport Column @@ -41,41 +36,29 @@ cpdef Column extract_year( result = move(cpp_extract_year(values.view())) return Column.from_libcudf(move(result)) +cpdef Column extract_datetime_component( + Column values, + datetime_component component +): + """ + Extract a datetime component from a datetime column. -def extract_datetime_component(Column col, str field): + For details, see :cpp:func:`cudf::extract_datetime_component`. - cdef unique_ptr[column] c_result + Parameters + ---------- + values : Column + The column to extract the component from. + component : DatetimeComponent + The datetime component to extract. 
- with nogil: - if field == "year": - c_result = move(cpp_extract_year(col.view())) - elif field == "month": - c_result = move(cpp_extract_month(col.view())) - elif field == "day": - c_result = move(cpp_extract_day(col.view())) - elif field == "weekday": - c_result = move(cpp_extract_weekday(col.view())) - elif field == "hour": - c_result = move(cpp_extract_hour(col.view())) - elif field == "minute": - c_result = move(cpp_extract_minute(col.view())) - elif field == "second": - c_result = move(cpp_extract_second(col.view())) - elif field == "millisecond": - c_result = move( - cpp_extract_millisecond_fraction(col.view()) - ) - elif field == "microsecond": - c_result = move( - cpp_extract_microsecond_fraction(col.view()) - ) - elif field == "nanosecond": - c_result = move( - cpp_extract_nanosecond_fraction(col.view()) - ) - elif field == "day_of_year": - c_result = move(cpp_day_of_year(col.view())) - else: - raise ValueError(f"Invalid datetime field: '{field}'") + Returns + ------- + Column + Column with the extracted component. + """ + cdef unique_ptr[column] result - return Column.from_libcudf(move(c_result)) + with nogil: + result = move(cpp_extract_datetime_component(values.view(), component)) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 2167616690f..15beaee47d4 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx labeling.pyx reduce.pyx - replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx +set(cython_sources + aggregation.pyx binaryop.pyx copying.pyx datetime.pyx expressions.pyx labeling.pyx reduce.pyx + replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd index a4465343197..73cdfb96af5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint8_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -7,6 +8,18 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: + cpdef enum class datetime_component(uint8_t): + YEAR + MONTH + DAY + WEEKDAY + HOUR + MINUTE + SECOND + MILLISECOND + MICROSECOND + NANOSECOND + cdef unique_ptr[column] extract_year(const column_view& column) except + cdef unique_ptr[column] extract_month(const column_view& column) except + cdef unique_ptr[column] extract_day(const column_view& column) except + @@ -23,6 +36,10 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_nanosecond_fraction( const column_view& column ) except + + cdef unique_ptr[column] extract_datetime_component( + const column_view& column, + datetime_component component + ) except + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": DAY "cudf::datetime::rounding_frequency::DAY" diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pyx 
b/python/pylibcudf/pylibcudf/libcudf/datetime.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index abf4357f862..b8b4343173e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx) +set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd index 5d54c1c3593..76afe047e8c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd @@ -22,9 +22,6 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ CASE_TYPES ALL_TYPES -cdef extern from "cudf/strings/char_types/char_types.hpp" \ - namespace "cudf::strings" nogil: - cdef unique_ptr[column] all_characters_of_type( column_view source_strings, string_character_types types, diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 83a9573baad..e6688cfff81 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( - column_view input_col, + column_view input, string_scalar true_string) except + cdef unique_ptr[column] from_booleans( - column_view input_col, 
+ column_view booleans, string_scalar true_string, string_scalar false_string) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index fa8975c4df9..fceddd58df0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( - column_view input_col, + column_view input, data_type timestamp_type, string format) except + cdef unique_ptr[column] from_timestamps( - column_view input_col, + column_view timestamps, string format, - column_view input_strings_names) except + + column_view names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd index ebe10574353..43ffad1d89f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd @@ -10,7 +10,7 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_durations( - const column_view & strings_col, + const column_view & input, data_type duration_type, const string & format) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index 6f820f3c9a4..72ab329f2dd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ 
b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -9,13 +9,13 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_fixed_point( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] from_fixed_point( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] is_fixed_point( - column_view source_strings, - data_type output_type + column_view input, + data_type decimal_type ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index fe571cfced6..801db438e92 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -8,11 +8,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] integers_to_ipv4( - column_view input_col) except + + column_view integers) except + cdef unique_ptr[column] is_ipv4( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd index 0491644a10a..3d048c1f50b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] find_multiple( - column_view source_strings, + column_view input, column_view targets) except + diff --git 
a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index e0a8b776465..0d286c36446 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -11,3 +11,7 @@ cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( column_view input, regex_program prog) except + + + cdef unique_ptr[column] find_re( + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd index 657fe61eb14..875f8cafd14 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd @@ -12,11 +12,11 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] pad( - column_view source_strings, + column_view input, size_type width, side_type side, string fill_char) except + cdef unique_ptr[column] zfill( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 019ff3f17ba..e92c5dc1d66 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,12 +1,10 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-from libc.stdint cimport int32_t +from libcpp cimport int cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - cpdef enum class side_type(int32_t): - LEFT 'cudf::strings::side_type::LEFT' - RIGHT 'cudf::strings::side_type::RIGHT' - BOTH 'cudf::strings::side_type::BOTH' - -ctypedef int32_t underlying_type_t_side_type + cpdef enum class side_type(int): + LEFT + RIGHT + BOTH diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd index 4162e886a7d..4299cf62e99 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd @@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] partition( - column_view source_strings, + column_view input, string_scalar delimiter) except + cdef unique_ptr[table] rpartition( - column_view source_strings, + column_view input, string_scalar delimiter) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd index 3046149aebb..a22a79fc7d7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd @@ -14,22 +14,22 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[table] rsplit( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] split_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record( - column_view source_strings, + column_view strings, 
string_scalar delimiter, size_type maxsplit) except + @@ -38,21 +38,21 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[table] rsplit_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] split_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd index b0ca771762d..dd527a78e7f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd @@ -10,6 +10,6 @@ from pylibcudf.libcudf.strings.side_type cimport side_type cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] strip( - column_view source_strings, - side_type stype, + column_view input, + side_type side, string_scalar to_strip) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd index 85fa719128a..9fd24f2987b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd @@ -13,15 +13,15 @@ from pylibcudf.libcudf.types cimport char_utf8 cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] translate( - column_view source_strings, + column_view input, vector[pair[char_utf8, char_utf8]] chars_table) 
except + - ctypedef enum filter_type: - KEEP 'cudf::strings::filter_type::KEEP', - REMOVE 'cudf::strings::filter_type::REMOVE' + cpdef enum class filter_type(bool): + KEEP + REMOVE cdef unique_ptr[column] filter_characters( - column_view source_strings, - vector[pair[char_utf8, char_utf8]] chars_table, - filter_type keep, + column_view input, + vector[pair[char_utf8, char_utf8]] characters_to_filter, + filter_type keep_characters, string_scalar replacement) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd index c0053391328..abc1bd43ad2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd @@ -9,5 +9,5 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] wrap( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt new file mode 100644 index 00000000000..eb5617a1da6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources edit_distance.pyx generate_ngrams.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd new file mode 100644 index 00000000000..7f5fa2b9925 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport edit_distance, generate_ngrams + +__all__ = [ + "edit_distance", + "generate_ngrams", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py new file mode 100644 index 00000000000..a66ce984745 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import edit_distance, generate_ngrams + +__all__ = [ + "edit_distance", + "generate_ngrams", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd new file mode 100644 index 00000000000..446b95afabb --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column edit_distance(Column input, Column targets) + +cpdef Column edit_distance_matrix(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx new file mode 100644 index 00000000000..fc98ccbc50c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.edit_distance cimport ( + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix, +) + + +cpdef Column edit_distance(Column input, Column targets): + """ + Returns the edit distance between individual strings in two strings columns + + For details, see :cpp:func:`edit_distance` + + Parameters + ---------- + input : Column + Input strings + targets : Column + Strings to compute edit distance against + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef column_view c_targets = targets.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance(c_strings, c_targets)) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column edit_distance_matrix(Column input): + """ + Returns the edit distance between all strings in the input strings column + + For details, see :cpp:func:`edit_distance_matrix` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance_matrix(c_strings)) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd new file mode 100644 index 00000000000..f15eb1f25e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator) + +cpdef Column generate_character_ngrams(Column input, size_type ngrams=*) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams=*) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx new file mode 100644 index 00000000000..8c7a8edc01d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -0,0 +1,111 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( + generate_character_ngrams as cpp_generate_character_ngrams, + generate_ngrams as cpp_generate_ngrams, + hash_character_ngrams as cpp_hash_character_ngrams, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): + """ + Returns a single column of strings by generating ngrams from a strings column. 
+ + For details, see :cpp:func:`generate_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngram : size_type + The ngram number to generate + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings columns of tokens + """ + cdef column_view c_strings = input.view() + cdef const string_scalar* c_separator = separator.c_obj.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of ngrams of characters within each string. + + For details, see :cpp:func:`generate_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngram : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of strings + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of hash values of the characters in each string + + For details, see :cpp:func:`hash_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngram : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of hash values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_hash_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 142bc124ca2..eeb44d19333 100644 --- 
a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -20,7 +20,9 @@ set(cython_sources contains.pyx extract.pyx find.pyx + find_multiple.pyx findall.pyx + padding.pyx regex_flags.pyx regex_program.pyx repeat.pyx @@ -28,6 +30,8 @@ set(cython_sources side_type.pyx slice.pyx strip.pyx + translate.pyx + wrap.pyx ) set(linked_libraries cudf::cudf) @@ -38,3 +42,4 @@ rapids_cython_create_modules( ) add_subdirectory(convert) +add_subdirectory(split) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d8afccc7336..187ef113073 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -9,12 +9,18 @@ from . cimport ( convert, extract, find, + find_multiple, findall, + padding, regex_flags, regex_program, replace, + side_type, slice, + split, strip, + translate, + wrap, ) from .side_type cimport side_type @@ -33,5 +39,8 @@ __all__ = [ "replace", "slice", "strip", + "split", "side_type", + "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 22452812e42..6033cea0625 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -9,13 +9,19 @@ convert, extract, find, + find_multiple, findall, + padding, regex_flags, regex_program, repeat, replace, + side_type, slice, + split, strip, + translate, + wrap, ) from .side_type import SideType @@ -34,5 +40,8 @@ "replace", "slice", "strip", + "split", "SideType", + "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/strings/char_types.pxd index ad4e4cf61d8..f9f7d244212 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/strings/char_types.pxd @@ -1,3 +1,19 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from pylibcudf.column cimport Column from pylibcudf.libcudf.strings.char_types cimport string_character_types +from pylibcudf.scalar cimport Scalar + + +cpdef Column all_characters_of_type( + Column source_strings, + string_character_types types, + string_character_types verify_types +) + +cpdef Column filter_characters_of_type( + Column source_strings, + string_character_types types_to_remove, + Scalar replacement, + string_character_types types_to_keep +) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index e7621fb4d84..6a24d79bc4b 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -1,4 +1,93 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings cimport char_types as cpp_char_types +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint + + +cpdef Column all_characters_of_type( + Column source_strings, + string_character_types types, + string_character_types verify_types +): + """ + Identifies strings where all characters match the specified type. + + Parameters + ---------- + source_strings : Column + Strings instance for this operation + types : StringCharacterTypes + The character types to check in each string + verify_types : StringCharacterTypes + Only verify against these character types. 
+ + Returns + ------- + Column + New column of boolean results for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_char_types.all_characters_of_type( + source_strings.view(), + types, + verify_types, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column filter_characters_of_type( + Column source_strings, + string_character_types types_to_remove, + Scalar replacement, + string_character_types types_to_keep +): + """ + Filter specific character types from a column of strings. + + Parameters + ---------- + source_strings : Column + Strings instance for this operation + types_to_remove : StringCharacterTypes + The character types to check in each string. + replacement : Scalar + The replacement character to use when removing characters + types_to_keep : StringCharacterTypes + Default `ALL_TYPES` means all characters of `types_to_remove` + will be filtered. + + Returns + ------- + Column + New column with the specified characters filtered out and + replaced with the specified replacement string. + """ + cdef const string_scalar* c_replacement = ( + replacement.c_obj.get() + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_char_types.filter_characters_of_type( + source_strings.view(), + types_to_remove, + dereference(c_replacement), + types_to_keep, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..eb0d6ee6999 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx + convert_fixed_point.pyx convert_ipv4.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..431beed8e5d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_ipv4, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..a601b562c2e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations +from . import ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_ipv4, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd new file mode 100644 index 00000000000..312ac3c0ca0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column to_booleans(Column input, Scalar true_string) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx new file mode 100644 index 00000000000..0c10f821ab6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert cimport ( + convert_booleans as cpp_convert_booleans, +) +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column to_booleans(Column input, Scalar true_string): + """ + Returns a new bool column by parsing boolean values from the strings + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_booleans`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + true_string : Scalar + String to expect for true. Non-matching strings are false + + Returns + ------- + Column + New bool column converted from strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): + """ + Returns a new strings column converting the boolean values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_booleans`. 
+ + Parameters + ---------- + booleans : Column + Boolean column to convert. + + true_string : Scalar + String to use for true in the output column. + + false_string : Scalar + String to use for false in the output column. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + cdef const string_scalar* c_false_string = ( + false_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 07c84d263d6..80ec168644b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ) + +cpdef Column is_timestamp( + Column input, + str format, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index fcacb096f87..0ee60812e00 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -15,28 +15,74 @@ from pylibcudf.types import DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ): + """ + Returns a new timestamp column converting a strings column into + timestamps using the provided format pattern. 
+ + For details, see cpp:`cudf::strings::to_timestamps`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + timestamp_type : DataType + The timestamp type used for creating the output column. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New datetime column + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ): + """ + Returns a new strings column converting a timestamp column into + strings using the provided format pattern. + + For details, see cpp:`cudf::strings::from_timestamps`. + + Parameters + ---------- + timestamps : Column + Timestamp values to convert + + format : str + The string specifying output format. + + input_strings_names : Column + The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + + Returns + ------- + Column + New strings column with formatted timestamps. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.from_timestamps( - input.view(), - format, + timestamps.view(), + c_format, input_strings_names.view() ) @@ -44,13 +90,33 @@ cpdef Column from_timestamps( cpdef Column is_timestamp( Column input, - const string& format + str format ): + """ + Verifies the given strings column can be parsed to timestamps + using the provided format pattern. + + For details, see cpp:`cudf::strings::is_timestamp`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New bool column. 
+ """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - format + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd index ac11b8959ed..eecdade4ef9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -8,10 +8,10 @@ from pylibcudf.types cimport DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index f3e0b7c9c8e..76c5809c3d5 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -15,27 +15,80 @@ from pylibcudf.types import DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ): + """ + Returns a new duration column converting a strings column into + durations using the provided format pattern. + + For details, see cpp:func:`cudf::strings::to_durations` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + duration_type : DataType + The duration type used for creating the output column. + + format : str + String specifying the duration format in strings. + + Returns + ------- + Column + New duration column. 
+ """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() + with nogil: - c_result = cpp_convert_durations.to_durations( - input.view(), - duration_type.c_obj, - format + c_result = move( + cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + c_format + ) ) return Column.from_libcudf(move(c_result)) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=None ): + """ + Returns a new strings column converting a duration column into + strings using the provided format pattern. + + For details, see cpp:func:`cudf::strings::from_durations` + + Parameters + ---------- + durations : Column + Duration values to convert. + + format : str + The string specifying output format. + Default format is "%D days %H:%M:%S". + + Returns + ------- + Column + New strings column with formatted durations. + """ cdef unique_ptr[column] c_result + + if format is None: + format = "%D days %H:%M:%S" + cdef string c_format = format.encode() + with nogil: - c_result = cpp_convert_durations.from_durations( - input.view(), - format + c_result = move( + cpp_convert_durations.from_durations( + durations.view(), + c_format + ) ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd new file mode 100644 index 00000000000..049b9b3fffe --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_fixed_point(Column input, DataType output_type) + +cpdef Column from_fixed_point(Column input) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx new file mode 100644 index 00000000000..40dadf6f967 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -0,0 +1,107 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_fixed_point as cpp_fixed_point, +) +from pylibcudf.types cimport DataType, type_id + + +cpdef Column to_fixed_point(Column input, DataType output_type): + """ + Returns a new fixed-point column parsing decimal values from the + provided strings column. + + For details, see :cpp:details:`cudf::strings::to_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of fixed-point column to return including the scale value. + + Returns + ------- + Column + New column of output_type. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_fixed_point(Column input): + """ + Returns a new strings column converting the fixed-point values + into a strings column. + + For details, see :cpp:details:`cudf::strings::from_fixed_point` + + Parameters + ---------- + input : Column + Fixed-point column to convert. + + Returns + ------- + Column + New strings column. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.from_fixed_point( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to fixed-point. + + For details, see :cpp:details:`cudf::strings::is_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + decimal_type : DataType + Fixed-point type (with scale) used only for checking overflow. + Defaults to Decimal64 + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if decimal_type is None: + decimal_type = DataType(type_id.DECIMAL64) + + with nogil: + c_result = move( + cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd new file mode 100644 index 00000000000..c61f5c0bdca --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column ipv4_to_integers(Column input) + +cpdef Column integers_to_ipv4(Column integers) + +cpdef Column is_ipv4(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx new file mode 100644 index 00000000000..f2a980d4269 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 + + +cpdef Column ipv4_to_integers(Column input): + """ + Converts IPv4 addresses into integers. + + For details, see cpp:func:`cudf::strings::ipv4_to_integers` + + Parameters + ---------- + input : Column + Strings instance for this operation + + Returns + ------- + Column + New uint32 column converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.ipv4_to_integers( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_ipv4(Column integers): + """ + Converts integers into IPv4 addresses as strings. + + For details, see cpp:func:`cudf::strings::integers_to_ipv4` + + Parameters + ---------- + integers : Column + Integer (uint32) column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.integers_to_ipv4( + integers.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_ipv4(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from IPv4 format. + + For details, see cpp:func:`cudf::strings::is_ipv4` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.is_ipv4( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd new file mode 100644 index 00000000000..b7b3aefa336 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column find_multiple(Column input, Column targets) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx new file mode 100644 index 00000000000..413fc1cb79d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple + + +cpdef Column find_multiple(Column input, Column targets): + """ + Returns a lists column with character position values where each + of the target strings are found in each string. + + For details, see :cpp:func:`cudf::strings::find_multiple`. 
+ + Parameters + ---------- + input : Column + Strings instance for this operation + targets : Column + Strings to search for in each string + + Returns + ------- + Column + Lists column with character position values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_find_multiple.find_multiple( + input.view(), + targets.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd index 54afa088141..3c35a9c9aa9 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -4,4 +4,5 @@ from pylibcudf.column cimport Column from pylibcudf.strings.regex_program cimport RegexProgram +cpdef Column find_re(Column input, RegexProgram pattern) cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 3a6b87504b3..5212dc4594d 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -38,3 +38,35 @@ cpdef Column findall(Column input, RegexProgram pattern): ) return Column.from_libcudf(move(c_result)) + + +cpdef Column find_re(Column input, RegexProgram pattern): + """ + Returns character positions where the pattern first matches + the elements in input strings. 
+ + For details, see :cpp:func:`cudf::strings::find_re` + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New column of integers + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_findall.find_re( + input.view(), + pattern.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/strings/padding.pxd new file mode 100644 index 00000000000..a035a5ad187 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.side_type cimport side_type +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char) + +cpdef Column zfill(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx new file mode 100644 index 00000000000..24daaaa3838 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -0,0 +1,75 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport padding as cpp_padding +from pylibcudf.libcudf.strings.side_type cimport side_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char): + """ + Add padding to each string using a provided character. + + For details, see :cpp:func:`cudf::strings::pad`. + + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. 
+ side : SideType + Where to place the padding characters. + fill_char : str + Single UTF-8 character to use for padding + + Returns + ------- + Column + New column with padded strings. + """ + cdef unique_ptr[column] c_result + cdef string c_fill_char = fill_char.encode("utf-8") + + with nogil: + c_result = move( + cpp_padding.pad( + input.view(), + width, + side, + c_fill_char, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column zfill(Column input, size_type width): + """ + Add '0' as padding to the left of each string. + + For details, see :cpp:func:`cudf::strings::zfill`. + + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. + + Returns + ------- + Column + New column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_padding.zfill( + input.view(), + width, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd index 34b7a580380..34b03e9bc27 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -1,3 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index acdc7d6ff1f..cf0c770cc11 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,4 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
- from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt new file mode 100644 index 00000000000..8f544f6f537 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources partition.pyx split.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd similarity index 56% rename from python/custreamz/custreamz/tests/pytest.ini rename to python/pylibcudf/pylibcudf/strings/split/__init__.pxd index 7b0a9f29fb1..72086e57d9f 100644 --- a/python/custreamz/custreamz/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd @@ -1,4 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native +from . 
cimport partition, split diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/strings/split/__init__.py similarity index 56% rename from python/dask_cudf/dask_cudf/tests/pytest.ini rename to python/pylibcudf/pylibcudf/strings/split/__init__.py index 7b0a9f29fb1..2033e5e275b 100644 --- a/python/dask_cudf/dask_cudf/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -1,4 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native +from . import partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd new file mode 100644 index 00000000000..c18257a4787 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + + +cpdef Table partition(Column input, Scalar delimiter=*) + +cpdef Table rpartition(Column input, Scalar delimiter=*) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx new file mode 100644 index 00000000000..ecc959e65b0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -0,0 +1,95 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.split cimport partition as cpp_partition +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table partition(Column input, Scalar delimiter=None): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::partition`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. + + Returns + ------- + Table + New table of strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = move( + cpp_partition.partition( + input.view(), + dereference(c_delimiter) + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rpartition(Column input, Scalar delimiter=None): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter starting from the end of each string. + + For details, see :cpp:func:`cudf::strings::rpartition`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. 
+ + Returns + ------- + Table + New strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = move( + cpp_partition.rpartition( + input.view(), + dereference(c_delimiter) + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd new file mode 100644 index 00000000000..355a1874298 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit) + +cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit) + +cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit) + +cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit) + +cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx new file mode 100644 index 00000000000..a7d7f39fc47 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -0,0 +1,326 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.split cimport split as cpp_split +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): + """ + Returns a list of columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::split`. + + Parameters + ---------- + strings_column : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating the split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + New table of strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.split( + strings_column.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit): + """ + Returns a list of columns by splitting each string using the + specified delimiter starting from the end of each string. + + For details, see :cpp:func:`cudf::strings::rsplit`. + + Parameters + ---------- + strings_column : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating the split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. 
+ + Returns + ------- + Table + New table of strings columns. + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.rsplit( + strings_column.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit): + """ + Splits individual strings elements into a list of strings. + + For details, see :cpp:func:`cudf::strings::split_record`. + + Parameters + ---------- + strings : Column + A column of string elements to be split. + + delimiter : Scalar + The string to identify split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.split_record( + strings.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit): + """ + Splits individual strings elements into a list of strings starting + from the end of each string. + + For details, see :cpp:func:`cudf::strings::rsplit_record`. + + Parameters + ---------- + strings : Column + A column of string elements to be split. + + delimiter : Scalar + The string to identify split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. 
+ """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.rsplit_record( + strings.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_split.split_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string starting from + the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. 
+ """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_split.rsplit_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_split.split_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string starting from the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_split.rsplit_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/strings/translate.pxd new file mode 100644 index 00000000000..0ca746801d7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.translate cimport filter_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column translate(Column input, dict chars_table) + +cpdef Column filter_characters( + Column input, + dict characters_to_filter, + filter_type keep_characters, + Scalar replacement +) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx new file mode 100644 index 00000000000..a62c7ec4528 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -0,0 +1,122 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings cimport translate as cpp_translate +from pylibcudf.libcudf.types cimport char_utf8 +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference +from pylibcudf.libcudf.strings.translate import \ + filter_type as FilterType # no-cython-lint + + +cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table): + """ + Convert str.maketrans table to cudf compatible table. 
+ """ + cdef int table_size = len(table) + cdef vector[pair[char_utf8, char_utf8]] c_table + + c_table.reserve(table_size) + for key, value in table.items(): + if isinstance(value, int): + value = chr(value) + if isinstance(value, str): + value = int.from_bytes(value.encode(), byteorder='big') + if isinstance(key, int): + key = chr(key) + if isinstance(key, str): + key = int.from_bytes(key.encode(), byteorder='big') + c_table.push_back((key, value)) + + return c_table + + +cpdef Column translate(Column input, dict chars_table): + """ + Translates individual characters within each string. + + For details, see :cpp:func:`cudf::strings::translate`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + chars_table : dict + Table of UTF-8 character mappings + + Returns + ------- + Column + New column with padded strings. + """ + cdef unique_ptr[column] c_result + cdef vector[pair[char_utf8, char_utf8]] c_chars_table = _table_to_c_table( + chars_table + ) + + with nogil: + c_result = move( + cpp_translate.translate( + input.view(), + c_chars_table + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column filter_characters( + Column input, + dict characters_to_filter, + filter_type keep_characters, + Scalar replacement +): + """ + Removes ranges of characters from each string in a strings column. + + For details, see :cpp:func:`cudf::strings::filter_characters`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + characters_to_filter : dict + Table of character ranges to filter on + + keep_characters : FilterType + If true, the `characters_to_filter` are retained + and all other characters are removed. + + replacement : Scalar + Replacement string for each character removed. + + Returns + ------- + Column + New column with filtered strings. 
+ """ + cdef unique_ptr[column] c_result + cdef vector[pair[char_utf8, char_utf8]] c_characters_to_filter = _table_to_c_table( + characters_to_filter + ) + cdef const string_scalar* c_replacement = ( + replacement.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_translate.filter_characters( + input.view(), + c_characters_to_filter, + keep_characters, + dereference(c_replacement), + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/strings/wrap.pxd new file mode 100644 index 00000000000..fcc86650acf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx new file mode 100644 index 00000000000..11e31f54eee --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport wrap as cpp_wrap +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width): + """ + Wraps strings onto multiple lines shorter than `width` by + replacing appropriate white space with + new-line characters (ASCII 0x0A). + + For details, see :cpp:func:`cudf::strings::wrap`. 
+ + Parameters + ---------- + input : Column + String column + + width : int + Maximum character width of a line within each string + + Returns + ------- + Column + Column of wrapped strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_wrap.wrap( + input.view(), + width, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini deleted file mode 100644 index f572f85ca49..00000000000 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -xfail_strict = true -filterwarnings = - error - ignore:::.*xdist.* - ignore:::.*pytest.* -addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index 89c96829e71..75930d59058 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -1,7 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import datetime -import functools import pyarrow as pa import pyarrow.compute as pc @@ -10,19 +9,6 @@ from utils import assert_column_eq -@pytest.fixture -def date_column(has_nulls): - values = [ - datetime.date(1999, 1, 1), - datetime.date(2024, 10, 12), - datetime.date(1, 1, 1), - datetime.date(9999, 1, 1), - ] - if has_nulls: - values[2] = None - return plc.interop.from_arrow(pa.array(values, type=pa.date32())) - - @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) def datetime_column(has_nulls, request): values = [ @@ -40,24 +26,35 @@ def datetime_column(has_nulls, request): ) -@pytest.mark.parametrize( - "component, pc_fun", - [ - ("year", pc.year), - ("month", pc.month), - ("day", pc.day), - ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), - ("hour", pc.hour), - ("minute", pc.minute), - ("second", pc.second), - ("millisecond", pc.millisecond), - ("microsecond", pc.microsecond), - ("nanosecond", pc.nanosecond), +@pytest.fixture( + params=[ + ("year", plc.datetime.DatetimeComponent.YEAR), + ("month", plc.datetime.DatetimeComponent.MONTH), + ("day", plc.datetime.DatetimeComponent.DAY), + ("day_of_week", plc.datetime.DatetimeComponent.WEEKDAY), + ("hour", plc.datetime.DatetimeComponent.HOUR), + ("minute", plc.datetime.DatetimeComponent.MINUTE), + ("second", plc.datetime.DatetimeComponent.SECOND), + ("millisecond", plc.datetime.DatetimeComponent.MILLISECOND), + ("microsecond", plc.datetime.DatetimeComponent.MICROSECOND), + ("nanosecond", plc.datetime.DatetimeComponent.NANOSECOND), ], + ids=lambda x: x[0], ) -def test_extraction(datetime_column, component, pc_fun): +def component(request): + return request.param + + +def test_extract_datetime_component(datetime_column, component): + attr, component = component + kwargs = {} + if attr == "day_of_week": + kwargs = {"count_from_zero": False} got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = 
pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) + + expect = getattr(pc, attr)( + plc.interop.to_arrow(datetime_column), **kwargs + ).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py new file mode 100644 index 00000000000..7d93c471cc4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def edit_distance_data(): + arr1 = ["hallo", "goodbye", "world"] + arr2 = ["hello", "", "world"] + return pa.array(arr1), pa.array(arr2) + + +def test_edit_distance(edit_distance_data): + input_col, targets = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + ) + expected = pa.array([1, 7, 0], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_edit_distance_matrix(edit_distance_data): + input_col, _ = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance_matrix( + plc.interop.from_arrow(input_col) + ) + expected = pa.array( + [[0, 7, 4], [7, 0, 6], [4, 6, 0]], type=pa.list_(pa.int32()) + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py new file mode 100644 index 00000000000..5cf9874d595 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["ab", "cde", "fgh"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngram", [2, 3]) +@pytest.mark.parametrize("sep", ["_", "**", ","]) +def test_generate_ngrams(input_col, ngram, sep): + result = plc.nvtext.generate_ngrams.generate_ngrams( + plc.interop.from_arrow(input_col), + ngram, + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"]) + if ngram == 3: + expected = pa.array([f"ab{sep}cde{sep}fgh"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_generate_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]]) + if ngram == 3: + expected = pa.array([[], ["cde"], ["fgh"]]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_hash_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + pa_result = plc.interop.to_arrow(result) + assert all( + len(got) == max(0, len(s.as_py()) - ngram + 1) + for got, s in zip(pa_result, input_col) + ) + assert pa_result.type == pa.list_( + pa.field("element", pa.uint32(), nullable=False) + ) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py new file mode 100644 index 00000000000..bcd030c019e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +from utils import assert_column_eq + + +def test_all_characters_of_type(): + pa_array = pa.array(["1", "A"]) + result = plc.strings.char_types.all_characters_of_type( + plc.interop.from_arrow(pa_array), + plc.strings.char_types.StringCharacterTypes.ALPHA, + plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + expected = pc.utf8_is_alpha(pa_array) + assert_column_eq(result, expected) + + +def test_filter_characters_of_type(): + pa_array = pa.array(["=A="]) + result = plc.strings.char_types.filter_characters_of_type( + plc.interop.from_arrow(pa_array), + plc.strings.char_types.StringCharacterTypes.ALPHANUM, + plc.interop.from_arrow(pa.scalar(" ")), + plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + expected = pc.replace_substring(pa_array, "A", " ") + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index e9e95459d0e..69f7a0fdd33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -1,7 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from datetime import datetime - import pyarrow as pa import pylibcudf as plc import pytest @@ -21,39 +19,16 @@ def timestamp_type(request): return request.param -@pytest.fixture( - scope="module", - params=[ - pa.duration("ns"), - pa.duration("us"), - pa.duration("ms"), - pa.duration("s"), - ], -) -def duration_type(request): - return request.param - - @pytest.fixture(scope="module") def pa_timestamp_col(): return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) -@pytest.fixture(scope="module") -def pa_duration_col(): - return pa.array(["05:20:25"]) - - @pytest.fixture(scope="module") def plc_timestamp_col(pa_timestamp_col): return plc.interop.from_arrow(pa_timestamp_col) -@pytest.fixture(scope="module") -def plc_duration_col(pa_duration_col): - return plc.interop.from_arrow(pa_duration_col) - - @pytest.mark.parametrize("format", ["%Y-%m-%d"]) def test_to_datetime( pa_timestamp_col, plc_timestamp_col, timestamp_type, format @@ -62,24 +37,6 @@ def test_to_datetime( got = plc.strings.convert.convert_datetime.to_timestamps( plc_timestamp_col, plc.interop.from_arrow(timestamp_type), - format.encode(), - ) - assert_column_eq(expect, got) - - -@pytest.mark.parametrize("format", ["%H:%M:%S"]) -def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): - def to_timedelta(duration_str): - date = datetime.strptime(duration_str, format) - return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date - - expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( - duration_type - ) - - got = plc.strings.convert.convert_durations.to_durations( - plc_duration_col, - plc.interop.from_arrow(duration_type), - format.encode(), + format, ) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py new file mode 100644 index 00000000000..117c59ff1b8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py 
@@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_booleans(): + pa_array = pa.array(["true", None, "True"]) + result = plc.strings.convert.convert_booleans.to_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("True")), + ) + expected = pa.array([False, None, True]) + assert_column_eq(result, expected) + + +def test_from_booleans(): + pa_array = pa.array([True, None, False]) + result = plc.strings.convert.convert_booleans.from_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("A")), + plc.interop.from_arrow(pa.scalar("B")), + ) + expected = pa.array(["A", None, "B"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py new file mode 100644 index 00000000000..f3e84286a36 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import datetime + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture +def fmt(): + return "%Y-%m-%dT%H:%M:%S" + + +def test_to_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None]) + result = plc.strings.convert.convert_datetime.to_timestamps( + plc.interop.from_arrow(arr), + plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), + fmt, + ) + expected = pc.strptime(arr, fmt, "s") + assert_column_eq(result, expected) + + +def test_from_timestamp(fmt): + arr = pa.array([datetime.datetime(2020, 1, 1, 1, 1, 1), None]) + result = plc.strings.convert.convert_datetime.from_timestamps( + plc.interop.from_arrow(arr), + fmt, + plc.interop.from_arrow(pa.array([], type=pa.string())), + ) + # pc.strftime will add the extra %f + expected = pa.array(["2020-01-01T01:01:01", None]) + assert_column_eq(result, expected) + + +def test_is_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None, "2020-01-01"]) + result = plc.strings.convert.convert_datetime.is_timestamp( + plc.interop.from_arrow(arr), + fmt, + ) + expected = pa.array([True, None, False]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py new file mode 100644 index 00000000000..6d704309bfd --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from datetime import datetime, timedelta + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +def test_to_duration(pa_duration_col, plc_duration_col, duration_type): + format = "%H:%M:%S" + + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format, + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", [None, "%D days %H:%M:%S"]) +def test_from_durations(format): + pa_array = pa.array( + [timedelta(days=1, hours=1, minutes=1, seconds=1), None] + ) + result = plc.strings.convert.convert_durations.from_durations( + plc.interop.from_arrow(pa_array), format + ) + expected = pa.array(["1 days 01:01:01", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py new file mode 100644 index 00000000000..b1c4d729604 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import decimal + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_fixed_point(): + typ = pa.decimal128(38, 2) + arr = pa.array(["123", "1.23", None]) + result = plc.strings.convert.convert_fixed_point.to_fixed_point( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_fixed_point(): + arr = pa.array([decimal.Decimal("1.1"), None]) + result = plc.strings.convert.convert_fixed_point.from_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["1.1", None]) + assert_column_eq(result, expected) + + +def test_is_fixed_point(): + arr = pa.array(["123", "1.23", "1.2.3", "", None]) + result = plc.strings.convert.convert_fixed_point.is_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py new file mode 100644 index 00000000000..4dc3e512624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_ipv4_to_integers(): + arr = pa.array(["123.45.67.890", None]) + result = plc.strings.convert.convert_ipv4.ipv4_to_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array([2066564730, None], type=pa.uint32()) + assert_column_eq(result, expected) + + +def test_integers_to_ipv4(): + arr = pa.array([1, 0, None], type=pa.uint32()) + result = plc.strings.convert.convert_ipv4.integers_to_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["0.0.0.1", "0.0.0.0", None]) + assert_column_eq(result, expected) + + +def test_is_ipv4(): + arr = pa.array(["0.0.0.1", "1.2.34", "A", None]) + result = plc.strings.convert.convert_ipv4.is_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py new file mode 100644 index 00000000000..d6b37a388f0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_find_multiple(): + arr = pa.array(["abc", "def"]) + targets = pa.array(["a", "c", "e"]) + result = plc.strings.find_multiple.find_multiple( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(targets), + ) + expected = pa.array( + [ + [elem.find(target) for target in targets.to_pylist()] + for elem in arr.to_pylist() + ], + type=pa.list_(pa.int32()), + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py index 994552fa276..debfad92d00 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -21,3 +21,20 @@ def test_findall(): type=pa_result.type, ) assert_column_eq(result, expected) + + +def test_find_re(): + arr = pa.array(["bunny", "rabbit", "hare", "dog"]) + pattern = "[eb]" + result = plc.strings.findall.find_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + pa_result = plc.interop.to_arrow(result) + expected = pa.array( + [0, 2, 3, -1], + type=pa_result.type, + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py new file mode 100644 index 00000000000..2ba775d17ae --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc + + +def test_pad(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.pad( + plc.interop.from_arrow(arr), + 2, + plc.strings.side_type.SideType.LEFT, + "!", + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="!")) + assert result.equals(expected) + + +def test_zfill(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.zfill(plc.interop.from_arrow(arr), 2) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="0")) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py new file mode 100644 index 00000000000..80cae8d1c6b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + + +@pytest.fixture +def data_col(): + pa_arr = pa.array(["ab_cd", "def_g_h", None]) + plc_column = plc.interop.from_arrow(pa_arr) + return pa_arr, plc_column + + +def test_partition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.partition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def", None], + "b": ["_", "_", None], + "c": ["cd", "g_h", None], + } + ) + assert_table_eq(expected, result) + + +def test_rpartition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.rpartition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def_g", None], + "b": ["_", "_", None], + "c": ["cd", "h", None], + } + ) + assert_table_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py new file mode 100644 index 00000000000..2aeffac8209 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py @@ -0,0 +1,130 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq, assert_table_eq + + +@pytest.fixture +def data_col(): + pa_array = pa.array(["a_b_c", "d-e-f", None]) + plc_column = plc.interop.from_arrow(pa_array) + return pa_array, plc_column + + +@pytest.fixture +def delimiter(): + delimiter = "_" + plc_delimiter = plc.interop.from_arrow(pa.scalar(delimiter)) + return delimiter, plc_delimiter + + +@pytest.fixture +def re_delimiter(): + return "[_-]" + + +def test_split(data_col, delimiter): + _, plc_column = data_col + _, plc_delimiter = delimiter + result = plc.strings.split.split.split(plc_column, plc_delimiter, 1) + expected = pa.table( + { + "a": ["a", "d-e-f", None], + "b": ["b_c", None, None], + } + ) + assert_table_eq(expected, result) + + +def test_rsplit(data_col, delimiter): + _, plc_column = data_col + _, plc_delimiter = delimiter + result = plc.strings.split.split.rsplit(plc_column, plc_delimiter, 1) + expected = pa.table( + { + "a": ["a_b", "d-e-f", None], + "b": ["c", None, None], + } + ) + assert_table_eq(expected, result) + + +def test_split_record(data_col, delimiter): + pa_array, plc_column = data_col + delim, plc_delim = delimiter + result = plc.strings.split.split.split_record(plc_column, plc_delim, 1) + expected = pc.split_pattern(pa_array, delim, max_splits=1) + assert_column_eq(expected, result) + + +def test_rsplit_record(data_col, delimiter): + pa_array, plc_column = data_col + delim, plc_delim = delimiter + result = plc.strings.split.split.split_record(plc_column, plc_delim, 1) + expected = pc.split_pattern(pa_array, delim, max_splits=1) + assert_column_eq(expected, result) + + +def test_split_re(data_col, re_delimiter): + _, plc_column = data_col + result = plc.strings.split.split.split_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + 1, + ) + expected = pa.table( + { + "a": ["a", "d", 
None], + "b": ["b_c", "e-f", None], + } + ) + assert_table_eq(expected, result) + + +def test_rsplit_re(data_col, re_delimiter): + _, plc_column = data_col + result = plc.strings.split.split.rsplit_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + 1, + ) + expected = pa.table( + { + "a": ["a_b", "d-e", None], + "b": ["c", "f", None], + } + ) + assert_table_eq(expected, result) + + +def test_split_record_re(data_col, re_delimiter): + pa_array, plc_column = data_col + result = plc.strings.split.split.split_record_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + 1, + ) + expected = pc.split_pattern_regex(pa_array, re_delimiter, max_splits=1) + assert_column_eq(expected, result) + + +def test_rsplit_record_re(data_col, re_delimiter): + pa_array, plc_column = data_col + result = plc.strings.split.split.rsplit_record_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + -1, + ) + expected = pc.split_pattern_regex(pa_array, re_delimiter) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py new file mode 100644 index 00000000000..2ae893e69fb --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture +def data_col(): + pa_data_col = pa.array( + ["aa", "bbb", "cccc", "abcd", None], + type=pa.string(), + ) + return pa_data_col, plc.interop.from_arrow(pa_data_col) + + +@pytest.fixture +def trans_table(): + return str.maketrans("abd", "A Q") + + +def test_translate(data_col, trans_table): + pa_array, plc_col = data_col + result = plc.strings.translate.translate(plc_col, trans_table) + expected = pa.array( + [ + val.translate(trans_table) if isinstance(val, str) else None + for val in pa_array.to_pylist() + ] + ) + assert_column_eq(expected, result) + + +@pytest.mark.parametrize( + "keep", + [ + plc.strings.translate.FilterType.KEEP, + plc.strings.translate.FilterType.REMOVE, + ], +) +def test_filter_characters(data_col, trans_table, keep): + pa_array, plc_col = data_col + result = plc.strings.translate.filter_characters( + plc_col, trans_table, keep, plc.interop.from_arrow(pa.scalar("*")) + ) + exp_data = [] + flat_trans = set(trans_table.keys()).union(trans_table.values()) + for val in pa_array.to_pylist(): + if not isinstance(val, str): + exp_data.append(val) + else: + new_val = "" + for ch in val: + if ( + ch in flat_trans + and keep == plc.strings.translate.FilterType.KEEP + ): + new_val += ch + elif ( + ch not in flat_trans + and keep == plc.strings.translate.FilterType.REMOVE + ): + new_val += ch + else: + new_val += "*" + exp_data.append(new_val) + expected = pa.array(exp_data) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py new file mode 100644 index 00000000000..a1c820cd586 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import textwrap + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_wrap(): + width = 12 + pa_array = pa.array( + [ + "the quick brown fox jumped over the lazy brown dog", + "hello, world", + None, + ] + ) + result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), width) + expected = pa.array( + [ + textwrap.fill(val, width) if isinstance(val, str) else val + for val in pa_array.to_pylist() + ] + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index a8224f54e1c..be65142850f 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -95,6 +95,16 @@ skip = [ "__init__.py", ] +[tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*" +] +xfail_strict = true + [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml"