diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index af8d1289ea1..6f0e88fb245 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -62,3 +62,33 @@ jobs: UPDATE_ITEM: true UPDATE_LINKED_ISSUES: true secrets: inherit + + process-branch-name: + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: get-project-id + runs-on: ubuntu-latest + outputs: + branch-name: ${{ steps.process-branch-name.outputs.branch-name }} + steps: + - name: Extract branch name + id: process-branch-name + run: | + branch=${{ github.event.pull_request.base.ref }} + release=${branch#branch-} + echo "branch-name=$release" >> "$GITHUB_OUTPUT" + + update-release: + # This job sets the PR and its linked issues to the release they are targeting + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12 + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: [get-project-id, process-branch-name] + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ" + SINGLE_SELECT_FIELD_NAME: "Release" + SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}" + ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + UPDATE_ITEM: true + UPDATE_LINKED_ISSUES: true + secrets: inherit diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 55399d0371a..f5bcdc62604 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -24,14 +24,17 @@ rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" 
RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -# Download the pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep +# Download libcudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep -rapids-logger "Install pylibcudf" -python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl +rapids-logger "Install libcudf, pylibcudf and cudf_polars" +python -m pip install \ + -v \ + "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ + "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" -rapids-logger "Install cudf_polars" -python -m pip install $(echo ./dist/cudf_polars*.whl) TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') rapids-logger "Clone polars to ${TAG}" diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index 2386414b32e..9528549a562 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi - +rapids-print-env EXITCODE=0 trap "EXITCODE=1" ERR set +e diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8b45d26c367..bd5e6c3d569 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -63,7 +63,7 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc - polars>=1.8,<1.9 - pre-commit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 
354c1360e5a..565a3ebfa3c 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -61,7 +61,7 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc - polars>=1.8,<1.9 - pre-commit diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 25e69b89789..2c254415318 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - numba-cuda >=0.0.13 - numpy >=1.23,<3.0a0 diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 7c1efa0176c..3d965f30986 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -77,7 +77,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index b791d846d1d..2d4f8c0d80e 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -1,18 +1,47 @@ --- +# Notes on disabled checks +# ------------------------ +# modernize-use-equals-default: +# auto-fix is broken (doesn't insert =default correctly) +# modernize-concat-nested-namespaces: +# auto-fix is broken (can delete code) +# modernize-use-trailing-return-type: +# Purely stylistic, no benefit to rewriting everything +# modernize-return-braced-init-list: +# Stylistically we prefer to see the return type at the return site. +# See https://github.com/rapidsai/cudf/pull/16956#pullrequestreview-2341891672 +# for more information. 
+# modernize-use-bool-literals: +# Our tests use int flags for validity masks extensively and we prefer that +# clang-analyzer-cplusplus.NewDeleteLeaks: +# This check has numerous bugs, see +# https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+newdeleteleaks +# We encounter at least +# https://github.com/llvm/llvm-project/issues/60896 +# https://github.com/llvm/llvm-project/issues/69602 +# clang-analyzer-optin.core.EnumCastOutOfRange: +# We use enums as flags in multiple cases and this check makes ORing flags invalid +# clang-analyzer-optin.cplusplus.UninitializedObject: +# There is an error in nanoarrow that none of the clang-tidy filters (i.e. +# header-filter and exclude-header-filter) are able to properly avoid. This +# merits further investigation +# +# We need to verify that broken checks are still broken Checks: 'modernize-*, -modernize-use-equals-default, -modernize-concat-nested-namespaces, -modernize-use-trailing-return-type, - -modernize-use-bool-literals' - - # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) - # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) - # -modernize-use-trailing-return-type # just a preference + -modernize-return-braced-init-list, + -modernize-use-bool-literals, + clang-analyzer-*, + -clang-analyzer-cplusplus.NewDeleteLeaks, + -clang-analyzer-optin.core.EnumCastOutOfRange, + -clang-analyzer-optin.cplusplus.UninitializedObject' WarningsAsErrors: '' -HeaderFilterRegex: '' -AnalyzeTemporaryDtors: false +HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' +ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none CheckOptions: - key: modernize-loop-convert.MaxCopySize diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 136f43ee706..f7a5dd2f2fb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,6 +52,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled 
kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) +option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) option( diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..b8a53cd8bd9 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- @@ -392,11 +392,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# 
---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 65a44532cf1..f44f26e4d2c 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,14 +15,16 @@ */ #include -#include -#include #include #include +#include + #include +#include + #include #include #include @@ -35,13 +37,10 @@ enum class TreeType { }; template -class AST : public cudf::benchmark {}; - -template -static void BM_ast_transform(benchmark::State& state) +static void BM_ast_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size = static_cast(state.get_int64("table_size")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); // Create table data auto const n_cols = reuse_columns ? 
1 : tree_levels + 1; @@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state) auto const& expression_tree_root = expressions.back(); - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::compute_column(table, expression_tree_root); - } - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); -} + state.add_global_memory_reads(table_size * (tree_levels + 1)); -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ - (::benchmark::State & st) \ + static void name(::nvbench::state& st) \ { \ - BM_ast_transform(st); \ + ::BM_ast_transform(st); \ } \ - BENCHMARK_REGISTER_F(AST, name) \ - ->Apply(CustomRanges) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("tree_levels", {1, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index fa98d9e601a..7d267a88764 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,15 +15,14 @@ */ #include -#include -#include #include #include #include +#include + #include -#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -33,13 +32,10 @@ enum class TreeType { }; template -class BINARYOP : public cudf::benchmark {}; - -template -static void BM_binaryop_transform(benchmark::State& state) +static void BM_binaryop_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size{static_cast(state.get_int64("table_size"))}; + auto const tree_levels{static_cast(state.get_int64("tree_levels"))}; // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -47,9 +43,10 @@ static void BM_binaryop_transform(benchmark::State& state) cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); cudf::table_view table{*source_table}; - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + // Use the number of bytes read from global memory + state.add_global_memory_reads(table_size * (tree_levels + 1)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) auto const op = cudf::binary_operator::ADD; auto const result_data_type = cudf::data_type(cudf::type_to_id()); @@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state) result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } - } - - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); + }); } #define 
BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \ - (::benchmark::State & st) { BM_binaryop_transform(st); } + \ + static void name(::nvbench::state& st) \ + { \ + BM_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique, int32_t, @@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false); - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 2, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } -} - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 7086a61c7c5..bc0ff69bce9 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -15,20 +15,18 @@ */ #include -#include -#include #include -class COMPILED_BINARYOP : public cudf::benchmark {}; +#include template -void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +void BM_compiled_binaryop(nvbench::state& state, 
cudf::binary_operator binop) { - auto const column_size{static_cast(state.range(0))}; + auto const table_size = static_cast(state.get_int64("table_size")); auto const source_table = create_random_table( - {cudf::type_to_id(), cudf::type_to_id()}, row_count{column_size}); + {cudf::type_to_id(), cudf::type_to_id()}, row_count{table_size}); auto lhs = cudf::column_view(source_table->get_column(0)); auto rhs = cudf::column_view(source_table->get_column(1)); @@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) // Call once for hot cache. cudf::binary_operation(lhs, rhs, binop, output_dtype); - for (auto _ : state) { - cuda_event_timer timer(state, true); - cudf::binary_operation(lhs, rhs, binop, output_dtype); - } - // use number of bytes read and written to global memory - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); } +#define BM_STRINGIFY(a) #a + // TODO tparam boolean for null. 
-#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ - BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ - (::benchmark::State & st) \ - { \ - BM_compiled_binaryop(st, cudf::binary_operator::bop); \ - } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_compiled_binaryop(st, ::cudf::binary_operator::bop); \ + } \ + NVBENCH_BENCH(name) \ + .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \ + .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) #define build_name(a, b, c, d) a##_##b##_##c##_##d diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp deleted file mode 100644 index 2905895a63b..00000000000 --- a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include - -#include - -// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to -// run on most GPUs, but large enough to allow highest throughput -constexpr size_t data_size = 512 << 20; - -void parquet_read_common(cudf::size_type num_rows_to_read, - cudf::size_type num_cols_to_read, - cuio_source_sink_pair& source_sink, - nvbench::state& state) -{ - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - - auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_parquet(read_opts); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); - - auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); - state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); -} - -template -void bench_batched_memset(nvbench::state& state, nvbench::type_list>) -{ - auto const d_type = get_type_or_group(static_cast(DataType)); - auto const num_cols = static_cast(state.get_int64("num_cols")); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); - auto const compression = 
cudf::io::compression_type::NONE; - cuio_source_sink_pair source_sink(source_type); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .compression(compression); - cudf::io::write_parquet(write_opts); - auto const num_rows = view.num_rows(); - - parquet_read_common(num_rows, num_cols, source_sink, state); -} - -using d_type_list = nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) - .set_name("batched_memset") - .set_type_axes_names({"data_type"}) - .add_int64_axis("num_cols", {1000}) - .add_string_axis("io_type", {"DEVICE_BUFFER"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 8df1b431095..d7d7fcca044 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,15 +14,17 @@ # This function finds nanoarrow and sets any additional necessary environment variables. 
function(find_and_configure_nanoarrow) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") + # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS - GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git - GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb - GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 1b6a1730161..33b1b45fb44 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -16,7 +16,11 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp(USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) + set(export_args) + if(CUDF_EXPORT_NVCOMP) + set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endif() + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff new file mode 100644 index 00000000000..e9a36fcb567 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff @@ -0,0 +1,38 @@ +diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h +index caa6be4..70ec8a2 100644 +--- a/src/nanoarrow/common/inline_buffer.h ++++ b/src/nanoarrow/common/inline_buffer.h +@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t 
word, int32_t* out) { + } + + static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { +- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ++ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); +- bits[bytes_begin] &= only_byte_mask; ++ bits[bytes_begin] &= only_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte +- bits[bytes_begin] &= first_byte_mask; ++ bits[bytes_begin] &= first_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { +@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte +- *out_cursor = 0x00; ++ *out_cursor = 0x00; // NOLINT + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json new file mode 100644 index 00000000000..d529787e7c8 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_override.json @@ -0,0 +1,18 @@ + +{ + "packages" : { + "nanoarrow" : { + "version" : "0.6.0.dev", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", + "git_shallow" : false, + "patches" : [ + { + "file" : 
"${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", + "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 6d1c91a5752..6902b1948bd 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -8,6 +8,7 @@ This page specifies which regular expression (regex) features are currently supp - cudf::strings::extract() - cudf::strings::extract_all_record() - cudf::strings::findall() +- cudf::strings::find_re() - cudf::strings::replace_re() - cudf::strings::replace_with_backrefs() - cudf::strings::split_re() diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 7359a0d5fde..1eaea5b6374 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -38,6 +38,22 @@ namespace datetime { * @file */ +/** + * @brief Types of datetime components that may be extracted. + */ +enum class datetime_component : uint8_t { + YEAR, + MONTH, + DAY, + WEEKDAY, + HOUR, + MINUTE, + SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND +}; + /** * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. @@ -207,6 +223,24 @@ std::unique_ptr extract_nanosecond_fraction( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Extracts the specified datetime component from any datetime type and + * returns an int16_t cudf::column. 
+ * + * @param column cudf::column_view of the input datetime values + * @param component The datetime component to extract + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t datetime component + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_datetime_component( + cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group /** * @addtogroup datetime_compute diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 9db7e48498f..df3050d6494 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -115,6 +115,16 @@ std::unique_ptr extract_nanosecond_fraction(cudf::column_view cons rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @copydoc cudf::extract_datetime_component(cudf::column_view const&, datetime_component, + * rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + */ +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp new file mode 100644 index 00000000000..ed0ab9e6e5b --- /dev/null +++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief A helper function that copies a vector of vectors from source to destination addresses in + * a batched manner. + * + * @tparam SrcIterator **[inferred]** The type of device-accessible source addresses iterator + * @tparam DstIterator **[inferred]** The type of device-accessible destination address iterator + * @tparam SizeIterator **[inferred]** The type of device-accessible buffer size iterator + * + * @param src_iter Device-accessible iterator to source addresses + * @param dst_iter Device-accessible iterator to destination addresses + * @param size_iter Device-accessible iterator to the buffer sizes (in bytes) + * @param num_buffs Number of buffers to be copied + * @param stream CUDA stream to use + */ +template +void batched_memcpy_async(SrcIterator src_iter, + DstIterator dst_iter, + SizeIterator size_iter, + size_t num_buffs, + rmm::cuda_stream_view stream) +{ + size_t temp_storage_bytes = 0; + cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value()); + + rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; + + cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_iter, + dst_iter, + size_iter, + num_buffs, + stream.value()); +} + +} // namespace detail 
+} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp similarity index 98% rename from cpp/include/cudf/io/detail/batched_memset.hpp rename to cpp/include/cudf/detail/utilities/batched_memset.hpp index 1c74be4a9fe..75f738f7529 100644 --- a/cpp/include/cudf/io/detail/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -28,7 +28,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that takes in a vector of device spans and memsets them to the @@ -78,5 +78,5 @@ void batched_memset(std::vector> const& bufs, d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); } -} // namespace io::detail +} // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp index 8c1c3c28df8..e7643eb44bd 100644 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ b/cpp/include/cudf/detail/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ #include // Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::logger(), __VA_ARGS__) +#define CUDF_LOG_TRACE(...) 
SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index b12fbe39a57..dc14802adc1 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -86,14 +86,28 @@ class datasource { /** * @brief Creates a source from a file path. * + * @note Parameters `offset`, `max_size_estimate` and `min_size_estimate` are hints to the + * `datasource` implementation about the expected range of the data that will be read. The + * implementation may use these hints to optimize the read operation. These parameters are usually + * based on the byte range option. In this case, `min_size_estimate` should be no greater than the + * byte range to avoid potential issues when reading adjacent ranges. `max_size_estimate` can + * include padding after the byte range, to include additional data that may be needed for + * processing. 
+ * + @throws cudf::logic_error if the minimum size estimate is greater than the maximum size estimate + * * @param[in] filepath Path to the file to use - * @param[in] offset Bytes from the start of the file (the default is zero) - * @param[in] size Bytes from the offset; use zero for entire file (the default is zero) + * @param[in] offset Starting byte offset from which data will be read (the default is zero) + * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is + * zero, which means the whole file after `offset`) + * @param[in] min_size_estimate Lower estimate of the data range that will be read (the default is + * zero, which means the whole file after `offset`) * @return Constructed datasource object */ static std::unique_ptr create(std::string const& filepath, - size_t offset = 0, - size_t size = 0); + size_t offset = 0, + size_t max_size_estimate = 0, + size_t min_size_estimate = 0); /** * @brief Creates a source from a host memory buffer. diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index c6b9bc7e58a..867764b6d9a 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -66,6 +66,35 @@ std::unique_ptr findall( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the starting character index of the first match for the given pattern + * in each row of the input column + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * p = regex_program::create("[be]") + * r = find_re(s, p) + * r is now [0, 2, 3, -1] + * @endcode + * + * A null output row occurs if the corresponding input row is null. + * A -1 is returned for rows that do not contain a match. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of integers + */ +std::unique_ptr find_re( + strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of doxygen group } // namespace strings } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 762131a174f..15fdad21d9f 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -148,7 +148,7 @@ class table { std::vector columns(std::distance(begin, end)); std::transform( begin, end, columns.begin(), [this](auto index) { return _columns.at(index)->view(); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 4a990f67ce4..d41176590ea 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -241,7 +241,7 @@ class table_view : public detail::table_view_base { { std::vector columns(std::distance(begin, end)); std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index 45d5d1b12e1..982554a23f5 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -22,6 +22,10 @@ namespace CUDF_EXPORT cudf { +namespace detail { +spdlog::logger& logger(); +} + /** * @brief Returns the global logger. 
* @@ -43,6 +47,8 @@ namespace CUDF_EXPORT cudf { * * @return spdlog::logger& The logger. */ -spdlog::logger& logger(); +[[deprecated( + "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& +logger(); } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index 723ba310a1e..dca590baebf 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext { * @param targets Strings to compute edit distance against `input` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New lists column of edit distance values */ std::unique_ptr edit_distance( cudf::strings_column_view const& input, diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ddb0dbcd96d..a497cedb3bc 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -44,19 +44,6 @@ namespace cudf { namespace datetime { namespace detail { -enum class datetime_component { - INVALID = 0, - YEAR, - MONTH, - DAY, - WEEKDAY, - HOUR, - MINUTE, - SECOND, - MILLISECOND, - MICROSECOND, - NANOSECOND -}; enum class rounding_function { CEIL, ///< Rounds up to the next integer multiple of the provided frequency @@ -453,90 +440,70 @@ std::unique_ptr extract_year(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::YEAR, stream, mr); } std::unique_ptr extract_month(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return 
detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MONTH, stream, mr); } std::unique_ptr extract_day(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::DAY, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::WEEKDAY, stream, mr); } std::unique_ptr extract_hour(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::HOUR, stream, mr); } std::unique_ptr extract_minute(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MINUTE, stream, mr); } std::unique_ptr extract_second(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::SECOND, stream, mr); } std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, 
rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MILLISECOND, stream, mr); } std::unique_ptr extract_microsecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MICROSECOND, stream, mr); } std::unique_ptr extract_nanosecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::NANOSECOND, stream, mr); } std::unique_ptr last_day_of_month(column_view const& column, @@ -576,6 +543,32 @@ std::unique_ptr extract_quarter(column_view const& column, return apply_datetime_op(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ +#define extract(field) \ + case field: \ + return apply_datetime_op, cudf::type_id::INT16>( \ + column, stream, mr) + + switch (component) { + extract(datetime_component::YEAR); + extract(datetime_component::MONTH); + extract(datetime_component::DAY); + extract(datetime_component::WEEKDAY); + extract(datetime_component::HOUR); + extract(datetime_component::MINUTE); + extract(datetime_component::SECOND); + extract(datetime_component::MILLISECOND); + extract(datetime_component::MICROSECOND); + extract(datetime_component::NANOSECOND); + default: CUDF_FAIL("Unsupported datetime component."); + } +#undef extract +} + } // namespace detail std::unique_ptr 
ceil_datetimes(column_view const& column, @@ -661,6 +654,15 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_datetime_component(column, component, stream, mr); +} + std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 03cf6d4a0e0..d5caa4720ac 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -16,6 +16,7 @@ #include "avro.hpp" +#include #include #include @@ -302,7 +303,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& // Empty schema if (json_str == "[]") return true; - char depthbuf[MAX_SCHEMA_DEPTH]; + std::array depthbuf; int depth = 0, parent_idx = -1, entry_idx = -1; json_state_e state = state_attrname; std::string str; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index de8eea9e99b..5a060902eb2 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -122,14 +122,16 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( namespace { std::vector> make_datasources(source_info const& info, - size_t range_offset = 0, - size_t range_size = 0) + size_t offset = 0, + size_t max_size_estimate = 0, + size_t min_size_estimate = 0) { switch (info.type()) { case io_type::FILEPATH: { auto sources = std::vector>(); for (auto const& filepath : info.filepaths()) { - sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + sources.emplace_back( + cudf::io::datasource::create(filepath, offset, max_size_estimate, min_size_estimate)); } return sources; } @@ -211,7 +213,8 @@ table_with_metadata 
read_json(json_reader_options options, auto datasources = make_datasources(options.get_source(), options.get_byte_range_offset(), - options.get_byte_range_size_with_padding()); + options.get_byte_range_size_with_padding(), + options.get_byte_range_size()); return json::detail::read_json(datasources, options, stream, mr); } @@ -238,7 +241,8 @@ table_with_metadata read_csv(csv_reader_options options, auto datasources = make_datasources(options.get_source(), options.get_byte_range_offset(), - options.get_byte_range_size_with_padding()); + options.get_byte_range_size_with_padding(), + options.get_byte_range_size()); CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index dc7199d7ab1..e1241f8f90c 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -170,6 +170,9 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + if (column_v.is_empty()) { // empty begets empty + return make_empty_column(type_id::STRING); + } auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 790532c9d54..5ab36fdae8e 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -258,7 +258,7 @@ class ProtobufReader { private: template - friend class FunctionSwitchImpl; + friend struct FunctionSwitchImpl; void skip_bytes(size_t bytecnt) { diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5c70e35fd2e..ed0b6969154 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -20,6 +20,8 @@ #include "orc_gpu.hpp" #include +#include +#include #include #include #include @@ -1087,37 +1089,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) /** * @brief Merge chunked column data into a single contiguous stream * - * @param[in,out] strm_desc 
StripeStream device array [stripe][stream] - * @param[in,out] streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc StripeStream device array [stripe][stream] + * @param[in] streams List of encoder chunk streams [column][rowgroup] + * @param[out] srcs List of source encoder chunk stream data addresses + * @param[out] dsts List of destination StripeStream data addresses + * @param[out] sizes List of stream sizes in bytes */ // blockDim {compact_streams_block_size,1,1} CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) - gpuCompactOrcDataStreams(device_2dspan strm_desc, - device_2dspan streams) + gpuInitBatchedMemcpy(device_2dspan strm_desc, + device_2dspan streams, + device_span srcs, + device_span dsts, + device_span sizes) { - __shared__ __align__(16) StripeStream ss; - - auto const stripe_id = blockIdx.x; + auto const stripe_id = cudf::detail::grid_1d::global_thread_id(); auto const stream_id = blockIdx.y; - auto const t = threadIdx.x; + if (stripe_id >= strm_desc.size().first) { return; } - if (t == 0) { ss = strm_desc[stripe_id][stream_id]; } - __syncthreads(); + auto const out_id = stream_id * strm_desc.size().first + stripe_id; + StripeStream ss = strm_desc[stripe_id][stream_id]; if (ss.data_ptr == nullptr) { return; } auto const cid = ss.stream_type; auto dst_ptr = ss.data_ptr; for (auto group = ss.first_chunk_id; group < ss.first_chunk_id + ss.num_chunks; ++group) { + auto const out_id = stream_id * streams.size().second + group; + srcs[out_id] = streams[ss.column_id][group].data_ptrs[cid]; + dsts[out_id] = dst_ptr; + + // Also update the stream here, data will be copied in a separate kernel + streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; + auto const len = streams[ss.column_id][group].lengths[cid]; - if (len > 0) { - auto const src_ptr = streams[ss.column_id][group].data_ptrs[cid]; - for (uint32_t i = t; i < len; i += blockDim.x) { - dst_ptr[i] = src_ptr[i]; - } - __syncthreads(); - } - if (t == 0) { 
streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; } + // len is the size (in bytes) of the current stream. + sizes[out_id] = len; dst_ptr += len; } } @@ -1325,9 +1332,26 @@ void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) { + auto const num_rowgroups = enc_streams.size().second; + auto const num_streams = strm_desc.size().second; + auto const num_stripes = strm_desc.size().first; + auto const num_chunks = num_rowgroups * num_streams; + auto srcs = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto dsts = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto lengths = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); + dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( + strm_desc, enc_streams, srcs, dsts, lengths); + + // Copy streams in a batched manner. 
+ cudf::detail::batched_memcpy_async( + srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } std::optional CompressOrcDataStreams( diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b978799b8bc..312a5243687 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -228,7 +228,8 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list { +class parquet_field_string_list : public parquet_field_list { + public: parquet_field_string_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { @@ -396,8 +397,9 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list +class parquet_field_binary_list : public parquet_field_list, FieldType::BINARY> { + public: parquet_field_binary_list(int f, std::vector>& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e0d50d7ccf9..b3276c81c1f 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,6 +17,8 @@ #include "page_data.cuh" #include "page_decode.cuh" +#include + #include #include @@ -466,4 +468,28 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } } +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream) +{ + // Copy offsets to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async( + offsets, stream, cudf::get_current_device_resource_ref()); + // Iterator for the source (scalar) 
data + auto src_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); + + // Copy buffer addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async( + buff_addrs, stream, cudf::get_current_device_resource_ref()); + // size_iter is simply a constant iterator of sizeof(size_type) bytes. + auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); + + // Copy offsets to buffers in batched manner. + cudf::detail::batched_memcpy_async( + src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index e631e12119d..a8ba3a969ce 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -797,6 +797,18 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a + * batched manner. 
+ * + * @param offsets Host span of final offsets + * @param buff_addrs Host span of corresponding output col buffer end addresses + * @param stream CUDA stream to use + */ +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the string column data stored in the pages * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7d817bde7af..1b69ccb7742 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -371,13 +371,15 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. Maybe use thrust::for_each - // on each buffer. + // For list and string columns, add the final offset to every offset buffer. // Note : the reason we are doing this here instead of in the decode kernel is // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span // page boundaries). + std::vector out_buffers; + std::vector final_offsets; + out_buffers.reserve(_input_columns.size()); + final_offsets.reserve(_input_columns.size()); for (size_t idx = 0; idx < _input_columns.size(); idx++) { input_column_info const& input_col = _input_columns[idx]; @@ -393,25 +395,21 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // the final offset for a list at level N is the size of it's child size_type const offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + (out_buf.size - 1)); + final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column auto const sz = static_cast(col_string_sizes[idx]); if (sz <= strings::detail::get_offset64_threshold()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); + final_offsets.emplace_back(sz); } } } } + // Write the final offsets for list and string columns in a batched manner + WriteFinalOffsets(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 3763c2e8e6d..8cab68ea721 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -19,9 +19,9 @@ #include #include +#include #include #include -#include #include #include @@ -1656,9 +1656,9 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num } } - cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + cudf::detail::batched_memset(memset_bufs, static_cast(0), _stream); // Need to set null mask bufs to all high bits - cudf::io::detail::batched_memset( + cudf::detail::batched_memset( nullmask_bufs, std::numeric_limits::max(), _stream); } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 1dbb9369115..0b76f3d3e8f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ 
b/cpp/src/io/utilities/data_sink.cpp @@ -50,7 +50,8 @@ class file_sink : public data_sink { } } - ~file_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~file_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -114,7 +115,8 @@ class host_buffer_sink : public data_sink { public: explicit host_buffer_sink(std::vector* buffer) : buffer_(buffer) {} - ~host_buffer_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~host_buffer_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e4313eba454..0be976b6144 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -32,6 +32,7 @@ #include #include +#include namespace cudf { namespace io { @@ -54,6 +55,30 @@ class file_source : public datasource { } } + std::unique_ptr host_read(size_t offset, size_t size) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + ssize_t const read_size = std::min(size, _file.size() - offset); + + std::vector v(read_size); + CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); + return buffer::create(std::move(v)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + auto const read_size = std::min(size, _file.size() - offset); + + CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), + "read failed"); + return read_size; + } + ~file_source() override = default; [[nodiscard]] bool supports_device_read() const override @@ -138,40 +163,63 @@ class file_source : public datasource { */ class memory_mapped_source : public file_source { public: - explicit 
memory_mapped_source(char const* filepath, size_t offset, size_t size) + explicit memory_mapped_source(char const* filepath, + size_t offset, + size_t max_size_estimate, + size_t min_size_estimate) : file_source(filepath) { if (_file.size() != 0) { - map(_file.desc(), offset, size); - register_mmap_buffer(); + // Memory mapping is not exclusive, so we can include the whole region we expect to read + map(_file.desc(), offset, max_size_estimate); + // Buffer registration is exclusive (can't overlap with other registered buffers) so we + // register the lower estimate; this avoids issues when reading adjacent ranges from the same + // file from multiple threads + register_mmap_buffer(offset, min_size_estimate); } } ~memory_mapped_source() override { if (_map_addr != nullptr) { - munmap(_map_addr, _map_size); + unmap(); unregister_mmap_buffer(); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); + + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size); + } - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is only partially within the registered region, copy to a new + // host buffer to make the data safe to copy to the device + if (_reg_addr != nullptr and + (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) { + auto const src = static_cast(_map_addr) + (offset - _map_offset); + + return std::make_unique>>( + std::vector(src, src + read_size)); + } return std::make_unique( - static_cast(_map_addr) + (offset - _map_offset), read_size); + static_cast(_map_addr) + offset - _map_offset, 
read_size); } size_t host_read(size_t offset, size_t size, uint8_t* dst) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size, dst); + } auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); @@ -184,16 +232,18 @@ class memory_mapped_source : public file_source { * * Fixes nvbugs/4215160 */ - void register_mmap_buffer() + void register_mmap_buffer(size_t offset, size_t size) { - if (_map_addr == nullptr or _map_size == 0 or not pageableMemoryAccessUsesHostPageTables()) { - return; - } + if (_map_addr == nullptr or not pageableMemoryAccessUsesHostPageTables()) { return; } - auto const result = cudaHostRegister(_map_addr, _map_size, cudaHostRegisterDefault); - if (result == cudaSuccess) { - _is_map_registered = true; - } else { + // Registered region must be within the mapped region + _reg_offset = std::max(offset, _map_offset); + _reg_size = std::min(size != 0 ? 
size : _map_size, (_map_offset + _map_size) - _reg_offset); + + _reg_addr = static_cast(_map_addr) - _map_offset + _reg_offset; + auto const result = cudaHostRegister(_reg_addr, _reg_size, cudaHostRegisterReadOnly); + if (result != cudaSuccess) { + _reg_addr = nullptr; CUDF_LOG_WARN("cudaHostRegister failed with {} ({})", static_cast(result), cudaGetErrorString(result)); @@ -205,10 +255,12 @@ class memory_mapped_source : public file_source { */ void unregister_mmap_buffer() { - if (not _is_map_registered) { return; } + if (_reg_addr == nullptr) { return; } - auto const result = cudaHostUnregister(_map_addr); - if (result != cudaSuccess) { + auto const result = cudaHostUnregister(_reg_addr); + if (result == cudaSuccess) { + _reg_addr = nullptr; + } else { CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})", static_cast(result), cudaGetErrorString(result)); @@ -226,52 +278,30 @@ class memory_mapped_source : public file_source { // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); + if (_map_size == 0) { return; } // Check if accessing a region within already mapped area _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } - private: - size_t _map_size = 0; - size_t _map_offset = 0; - void* _map_addr = nullptr; - bool _is_map_registered = false; -}; - -/** - * @brief Implementation class for reading from a file using `read` calls - * - * Potentially faster than `memory_mapped_source` when only a small portion of the file is read - * through the host. 
- */ -class direct_read_source : public file_source { - public: - explicit direct_read_source(char const* filepath) : file_source(filepath) {} - - std::unique_ptr host_read(size_t offset, size_t size) override + void unmap() { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - ssize_t const read_size = std::min(size, _file.size() - offset); - - std::vector v(read_size); - CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); - return buffer::create(std::move(v)); + if (_map_addr != nullptr) { + auto const result = munmap(_map_addr, _map_size); + if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); } + _map_addr = nullptr; + } } - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - auto const read_size = std::min(size, _file.size() - offset); + private: + size_t _map_offset = 0; + size_t _map_size = 0; + void* _map_addr = nullptr; - CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), - "read failed"); - return read_size; - } + size_t _reg_offset = 0; + size_t _reg_size = 0; + void* _reg_addr = nullptr; }; /** @@ -431,16 +461,21 @@ class user_datasource_wrapper : public datasource { std::unique_ptr datasource::create(std::string const& filepath, size_t offset, - size_t size) + size_t max_size_estimate, + size_t min_size_estimate) { + CUDF_EXPECTS(max_size_estimate == 0 or min_size_estimate <= max_size_estimate, + "Invalid min/max size estimates for datasource creation"); + #ifdef CUFILE_FOUND if (cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads - return std::make_unique(filepath.c_str()); + return std::make_unique(filepath.c_str()); } #endif // Use our own memory mapping implementation for direct file reads - return std::make_unique(filepath.c_str(), offset, size); + return std::make_unique( + filepath.c_str(), offset, 
max_size_estimate, min_size_estimate); } std::unique_ptr datasource::create(host_buffer const& buffer) diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp index d9eac423901..1d8b34addbd 100644 --- a/cpp/src/io/utilities/hostdevice_span.hpp +++ b/cpp/src/io/utilities/hostdevice_span.hpp @@ -43,8 +43,8 @@ class hostdevice_span { template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> + std::remove_pointer_t().host_ptr())> (*)[], // NOLINT + T (*)[]>>* = nullptr> // NOLINT constexpr hostdevice_span(C& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) { } @@ -54,8 +54,8 @@ class hostdevice_span { template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> + std::remove_pointer_t().host_ptr())> (*)[], // NOLINT + T (*)[]>>* = nullptr> // NOLINT constexpr hostdevice_span(C const& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) { diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index d8c1b50a94b..21708e48a25 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -126,6 +126,43 @@ std::unique_ptr findall(strings_column_view const& input, mr); } +namespace { +struct find_re_fn { + column_device_view d_strings; + + __device__ size_type operator()(size_type const idx, + reprog_device const prog, + int32_t const thread_idx) const + { + if (d_strings.is_null(idx)) { return 0; } + auto const d_str = d_strings.element(idx); + + auto const result = prog.find(thread_idx, d_str, d_str.begin()); + return result.has_value() ? 
result.value().first : -1; + } +}; +} // namespace + +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto results = make_numeric_column(data_type{type_to_id()}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + if (input.is_empty()) { return results; } + + auto d_results = results->mutable_view().data(); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); + auto const d_strings = column_device_view::create(input.parent(), stream); + launch_transform_kernel(find_re_fn{*d_strings}, *d_prog, d_results, input.size(), stream); + + return results; +} } // namespace detail // external API @@ -139,5 +176,14 @@ std::unique_ptr findall(strings_column_view const& input, return detail::findall(input, prog, stream, mr); } +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::find_re(input, prog, stream, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index a87ecb81b9d..997b0278fe2 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,9 @@ namespace nvtext { namespace detail { namespace { +// long strings threshold found with benchmarking +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 64; + /** * @brief Generate ngrams from strings column. * @@ -173,33 +177,39 @@ constexpr cudf::thread_index_type bytes_per_thread = 4; /** * @brief Counts the number of ngrams in each row of the given strings column * - * Each warp processes a single string. + * Each warp/thread processes a single string. 
* Formula is `count = max(0,str.length() - ngrams + 1)` * If a string has less than ngrams characters, its count is 0. */ CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + cudf::size_type tile_size, cudf::size_type* d_counts) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / cudf::detail::warp_size; + auto const str_idx = idx / tile_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { d_counts[str_idx] = 0; return; } + auto const d_str = d_strings.element(str_idx); + if (tile_size == 1) { + d_counts[str_idx] = cuda::std::max(0, (d_str.length() + 1 - ngrams)); + return; + } + namespace cg = cooperative_groups; auto const warp = cg::tiled_partition(cg::this_thread_block()); - auto const d_str = d_strings.element(str_idx); - auto const end = d_str.data() + d_str.size_bytes(); + auto const end = d_str.data() + d_str.size_bytes(); auto const lane_idx = warp.thread_rank(); cudf::size_type count = 0; for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; - itr += cudf::detail::warp_size * bytes_per_thread) { + itr += tile_size * bytes_per_thread) { for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); } @@ -256,19 +266,27 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Parameter ngrams should be an integer value of 2 or greater", std::invalid_argument); - auto const strings_count = input.size(); - if (strings_count == 0) { // if no strings, return an empty column - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (input.is_empty()) { // if no strings, return an empty column + return cudf::lists::detail::make_empty_lists_column( + cudf::data_type{cudf::type_id::STRING}, stream, mr); + } + if (input.size() == input.null_count()) { + return cudf::lists::detail::make_all_nulls_lists_column( + 
input.size(), cudf::data_type{cudf::type_id::STRING}, stream, mr); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto [offsets, total_ngrams] = [&] { - auto counts = rmm::device_uvector(input.size(), stream); - auto const num_blocks = cudf::util::div_rounding_up_safe( - static_cast(input.size()) * cudf::detail::warp_size, block_size); - count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + auto counts = rmm::device_uvector(input.size(), stream); + auto const avg_char_bytes = (input.chars_size(stream) / (input.size() - input.null_count())); + auto const tile_size = (avg_char_bytes < AVG_CHAR_BYTES_THRESHOLD) + ? 1 // thread per row + : cudf::detail::warp_size; // warp per row + auto const grid = cudf::detail::grid_1d( + static_cast(input.size()) * tile_size, block_size); + count_char_ngrams_kernel<<>>( + *d_strings, ngrams, tile_size, counts.data()); return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); @@ -277,8 +295,8 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Insufficient number of characters in each string to generate ngrams"); character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets}; - auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - generator, strings_count, total_ngrams, stream, mr); + auto [offsets_column, chars] = + cudf::strings::detail::make_strings_children(generator, input.size(), total_ngrams, stream, mr); auto output = cudf::make_strings_column( total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); @@ -368,7 +386,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto [offsets, total_ngrams] = [&] { auto counts = rmm::device_uvector(input.size(), stream); count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + *d_strings, ngrams, cudf::detail::warp_size, counts.data()); return 
cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 125b98c4a67..9d8e3cf2fa6 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -115,12 +115,19 @@ class fixed_pinned_pool_memory_resource { return !operator==(other); } - friend void get_property(fixed_pinned_pool_memory_resource const&, + // clang-tidy will complain about this function because it is completely + // unused at runtime and only exist for tag introspection by CCCL, so we + // ignore linting. This masks a real issue if we ever want to compile with + // clang, though, which is that the function will actually be compiled out by + // clang. If cudf were ever to try to support clang as a compile we would + // need to force the compiler to emit this symbol. The same goes for the + // other get_property definitions in this file. + friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::device_accessible) noexcept { } - friend void get_property(fixed_pinned_pool_memory_resource const&, + friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::host_accessible) noexcept { } @@ -235,7 +242,9 @@ class new_delete_memory_resource { bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + // NOLINTBEGIN friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} + // NOLINTEND }; static_assert(cuda::mr::resource_with, diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp index d54f5677c4c..e52fffbd8c6 100644 --- a/cpp/src/utilities/logger.cpp +++ b/cpp/src/utilities/logger.cpp @@ -74,8 +74,10 @@ struct logger_wrapper { } // namespace -spdlog::logger& cudf::logger() +spdlog::logger& cudf::detail::logger() { static logger_wrapper wrapped{}; return 
wrapped.logger_; } + +spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..4596ec65ce7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -385,6 +385,8 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST + utilities_tests/batched_memcpy_tests.cu + utilities_tests/batched_memset_tests.cu utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp @@ -395,7 +397,6 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp - utilities_tests/batched_memset_tests.cu ) # ################################################################################################## diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 06e0d193d80..aa5b49567e6 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -557,7 +557,11 @@ auto NullOp_Result(cudf::column_view lhs, cudf::column_view rhs) std::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(lhs.size()), result.begin(), - [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { + [&lhs_data = lhs_data, + &lhs_mask = lhs_mask, + &rhs_data = rhs_data, + &rhs_mask = rhs_mask, + &result_mask = result_mask](auto i) -> TypeOut { auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); bool output_valid = lhs_valid or rhs_valid; diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index d36b48d666a..ef1ccfccab5 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -100,7 
+100,7 @@ struct Mul { std::enable_if_t<(cudf::is_duration_t::value && std::is_integral_v) || (cudf::is_duration_t::value && std::is_integral_v), void>* = nullptr> - OutT DurationProduct(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationProduct(LhsT x, RhsT y) const { return x * y; } @@ -128,7 +128,7 @@ struct Div { typename LhsT, typename RhsT, std::enable_if_t<(std::is_integral_v || cudf::is_duration()), void>* = nullptr> - OutT DurationDivide(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationDivide(LhsT x, RhsT y) const { return x / y; } diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 14b4197de71..631f5150829 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -340,7 +340,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorNoMask) cudf::column moved_to{std::move(original)}; - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); verify_column_views(moved_to); @@ -359,7 +359,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorWithMask) cudf::column moved_to{std::move(original)}; verify_column_views(moved_to); - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); // Verify move diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index bebd3d25610..aef0d4ad78a 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -370,11 +371,12 @@ TEST_F(SliceStringTableTest, StringWithNulls) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - 
cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); diff --git a/cpp/tests/copying/slice_tests.cuh b/cpp/tests/copying/slice_tests.cuh index a180740f143..1e037294527 100644 --- a/cpp/tests/copying/slice_tests.cuh +++ b/cpp/tests/copying/slice_tests.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -148,7 +148,7 @@ std::vector create_expected_tables(cudf::size_type num_cols, } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; @@ -163,13 +163,12 @@ inline std::vector create_expected_string_co for (unsigned long index = 0; index < indices.size(); index += 2) { if (not nullable) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1])); + result.emplace_back(strings.begin() + indices[index], strings.begin() + indices[index + 1]); } else { auto valids = cudf::detail::make_counting_transform_iterator( indices[index], [](auto i) { return i % 2 == 0; }); - result.push_back(cudf::test::strings_column_wrapper( - strings.begin() + indices[index], strings.begin() + indices[index + 1], valids)); + result.emplace_back( + strings.begin() + indices[index], strings.begin() + indices[index + 1], valids); } } @@ -184,16 +183,16 @@ inline std::vector create_expected_string_co std::vector result = {}; for 
(unsigned long index = 0; index < indices.size(); index += 2) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1], - validity.begin() + indices[index])); + result.emplace_back(strings.begin() + indices[index], + strings.begin() + indices[index + 1], + validity.begin() + indices[index]); } return result; } inline std::vector create_expected_string_tables( - std::vector const strings[2], + std::vector> const strings, std::vector const& indices, bool nullable) { @@ -216,7 +215,7 @@ inline std::vector create_expected_string_tables( } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index ee3e7da5e0f..b56b0f2d3f8 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -135,7 +136,7 @@ std::vector create_expected_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], + std::vector> const strings, std::vector const& splits, bool nullable) { @@ -144,8 +145,8 @@ std::vector create_expected_string_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], - std::vector const validity[2], + std::vector> const strings, + std::vector> const validity, std::vector const& splits) { std::vector indices = splits_to_indices(splits, strings[0].size()); @@ -627,11 +628,12 @@ void split_string_with_invalids(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = 
{{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -658,11 +660,12 @@ void split_empty_output_strings_column_value(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -684,9 +687,9 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector splits{2, 5, 9}; @@ -699,16 +702,17 @@ void 
split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) EXPECT_NO_THROW(Split(empty_table, splits)); } - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), no_valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), no_valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); scols.push_back(sw[1].release()); cudf::table src_table(std::move(scols)); auto result = Split(src_table, splits); - std::vector validity_masks[2] = {std::vector(strings[0].size()), - std::vector(strings[0].size())}; + std::vector> validity_masks{std::vector(strings[0].size()), + std::vector(strings[0].size())}; std::generate( validity_masks[1].begin(), validity_masks[1].end(), [i = 0]() mutable { return i++ % 2 == 0; }); @@ -1913,9 +1917,9 @@ TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) cudf::size_type start = 0; auto valids = cudf::detail::make_counting_transform_iterator(start, [](auto i) { return true; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector> cols; @@ -2377,7 +2381,7 @@ TEST_F(ContiguousSplitTableCornerCases, OutBufferToSmall) { // internally, contiguous split chunks GPU work in 1MB contiguous copies // so the output buffer must be 1MB or larger. 
- EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); + EXPECT_THROW(auto _ = cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); } TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 13577c4d0ea..603edb27c7c 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -196,6 +196,136 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ns), fixed_width_column_wrapper{766, 424, 623}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1969, 1970, 1970}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MONTH), + 
fixed_width_column_wrapper{12, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{31, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{23, 0, 
0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{762, 0, 929}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{976, 
23, 987}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{675, 432, 234}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{766, 424, 623}); } template diff --git a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp index 4fb8f78b558..0e68050f935 100644 --- a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,8 +22,6 @@ #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - using NumericTypesNoBools = cudf::test::Concat; diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index cc95c7a2f0f..8bc47c92c6b 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -23,8 +23,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA256HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA256HashTest, EmptyTable) diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index a4dc7531765..2151ec6e22f 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -270,9 +270,9 @@ TEST_F(FromArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -414,9 +414,9 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType) { std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table 
expected_table(std::move(columns)); cudf::table_view expected_table_view = expected_table.view(); diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index cbfa4911c3c..ef9936b214c 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -309,9 +309,9 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 81c406c0faf..6e742b9e4cf 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -52,7 +52,7 @@ std::unique_ptr get_cudf_table() .release()); auto col4 = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( {true, false, true, false, true}, {true, false, true, true, false}) .release()); @@ -339,9 +339,9 @@ TEST_F(FromArrowTest, DictionaryIndicesType) std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + 
columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 51216a8512c..7ba586461dc 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -55,7 +55,7 @@ get_nanoarrow_cudf_table(cudf::size_type length) auto col4 = cudf::test::fixed_width_column_wrapper( test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), test_data.bool_data.end(), test_data.bool_validity.begin()) @@ -82,8 +82,8 @@ get_nanoarrow_cudf_table(cudf::size_type length) test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back( @@ -575,9 +575,9 @@ TEST_F(ToArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); 
vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp index fc0ed6c9352..fcb4433b42e 100644 --- a/cpp/tests/interop/to_arrow_host_test.cpp +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -436,9 +436,9 @@ TEST_F(ToArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 90ae12cdd90..a6aa4b22eca 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -90,7 +90,7 @@ std::pair, std::shared_ptr> get_table auto col4 = cudf::test::fixed_width_column_wrapper( int64_data.begin(), int64_data.end(), validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( bool_data.begin(), bool_data.end(), bool_validity.begin()) .release()); @@ -112,8 +112,8 @@ std::pair, std::shared_ptr> get_table cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = 
cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( bool_data_validity.begin(), bool_data_validity.end())); columns.emplace_back( @@ -294,9 +294,9 @@ TEST_F(ToArrowTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -438,7 +438,7 @@ TEST_F(ToArrowTest, FixedPoint64TableLarge) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - std::vector const metadata = {{"a"}}; + std::vector const metadata = {{"a"}}; // NOLINT ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 840cf263ed9..54262dc3b44 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -39,19 +39,19 @@ using cudf::device_span; */ template struct DecompressTest : public cudf::test::BaseFixture { - std::vector vector_from_string(char const* str) const + [[nodiscard]] std::vector vector_from_string(std::string const str) const { - return std::vector(reinterpret_cast(str), - reinterpret_cast(str) + strlen(str)); + return {reinterpret_cast(str.c_str()), + reinterpret_cast(str.c_str()) + strlen(str.c_str())}; } - void Decompress(std::vector* decompressed, + void Decompress(std::vector& decompressed, uint8_t const* compressed, size_t compressed_size) { auto stream = cudf::get_default_stream(); rmm::device_buffer src{compressed, compressed_size, stream}; - rmm::device_uvector dst{decompressed->size(), stream}; + 
rmm::device_uvector dst{decompressed.size(), stream}; cudf::detail::hostdevice_vector> inf_in(1, stream); inf_in[0] = {static_cast(src.data()), src.size()}; @@ -67,7 +67,7 @@ struct DecompressTest : public cudf::test::BaseFixture { static_cast(this)->dispatch(inf_in, inf_out, inf_stat); CUDF_CUDA_TRY(cudaMemcpyAsync( - decompressed->data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); + decompressed.data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); inf_stat.device_to_host_sync(stream); ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS); } @@ -125,49 +125,57 @@ struct NvcompConfigTest : public cudf::test::BaseFixture {}; TEST_F(GzipDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0x1f, 0x8b, 0x8, 0x0, 0x9, 0x63, 0x99, 0x5c, 0x2, 0xff, 0xcb, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0x1, 0x0, 0x85, 0x11, 0x4a, 0xd, 0xb, 0x0, 0x0, 0x0}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x28, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, ShortLiteralAfterLongCopyAtStartup) { - constexpr char uncompressed[] = "Aaaaaaaaaaaah!"; + std::string const uncompressed{"Aaaaaaaaaaaah!"}; + // NOLINTBEGIN constexpr 
uint8_t compressed[] = {14, 0x0, 'A', 0x0, 'a', (10 - 4) * 4 + 1, 1, 0x4, 'h', '!'}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(BrotliDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x5, 0x80, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x3}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index dc14824d834..b265dcf9273 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -63,9 +63,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using column = cudf::column; using table = cudf::table; using table_view = cudf::table_view; @@ -954,7 +954,7 @@ TEST_F(CsvReaderTest, Strings) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, + std::vector{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}, view.column(1)); } @@ -1014,7 +1014,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, + std::vector{"\"abcdef ghi\"", R"("jkl ""mno"" pqr")", "stu \"vwx\" yz"}, 
view.column(1)); } @@ -1830,7 +1830,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) auto int_column = column_wrapper{10, 20, 30}; auto string_column = - column_wrapper{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}; + column_wrapper{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}; cudf::table_view input_table(std::vector{int_column, string_column}); // TODO add quoting style flag? @@ -2516,4 +2516,39 @@ TEST_F(CsvReaderTest, UTF8BOM) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expected); } +void expect_buffers_equal(cudf::io::datasource::buffer* lhs, cudf::io::datasource::buffer* rhs) +{ + ASSERT_EQ(lhs->size(), rhs->size()); + EXPECT_EQ(0, std::memcmp(lhs->data(), rhs->data(), lhs->size())); +} + +TEST_F(CsvReaderTest, OutOfMapBoundsReads) +{ + // write a lot of data into a file + auto filepath = temp_env->get_temp_dir() + "OutOfMapBoundsReads.csv"; + auto const num_rows = 1 << 20; + auto const row = std::string{"0,1,2,3,4,5,6,7,8,9\n"}; + auto const file_size = num_rows * row.size(); + { + std::ofstream outfile(filepath, std::ofstream::out); + for (size_t i = 0; i < num_rows; ++i) { + outfile << row; + } + } + + // Only memory map the middle of the file + auto source = cudf::io::datasource::create(filepath, file_size / 2, file_size / 4); + auto full_source = cudf::io::datasource::create(filepath); + auto const all_data = source->host_read(0, file_size); + auto ref_data = full_source->host_read(0, file_size); + expect_buffers_equal(ref_data.get(), all_data.get()); + + auto const start_data = source->host_read(file_size / 2, file_size / 2); + expect_buffers_equal(full_source->host_read(file_size / 2, file_size / 2).get(), + start_data.get()); + + auto const end_data = source->host_read(0, file_size / 2 + 512); + expect_buffers_equal(full_source->host_read(0, file_size / 2 + 512).get(), end_data.get()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 49ad0c408dc..cb6716f4a18 100644 
--- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -68,9 +68,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; cudf::test::TempDirTestEnvironment* const temp_env = static_cast( diff --git a/cpp/tests/io/json/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp index 2c4e29a01b9..39d31c406a5 100644 --- a/cpp/tests/io/json/json_writer.cpp +++ b/cpp/tests/io/json/json_writer.cpp @@ -70,6 +70,43 @@ TEST_F(JsonWriterTest, EmptyInput) EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } +TEST_F(JsonWriterTest, EmptyLeaf) +{ + cudf::test::strings_column_wrapper col1{""}; + cudf::test::fixed_width_column_wrapper offsets{0, 0}; + auto col2 = make_lists_column(1, + offsets.release(), + cudf::test::strings_column_wrapper{}.release(), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); + auto col3 = cudf::test::lists_column_wrapper::make_one_empty_row_column(); + cudf::table_view tbl_view{{col1, *col2, col3}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + + // Empty columns in table + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected = R"([{"col1":"","col2":[],"col3":[]}])"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + + // Empty columns in table - JSON Lines + out_buffer.clear(); + out_options.enable_lines(true); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected_lines = 
R"({"col1":"","col2":[],"col3":[]})" + "\n"; + EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); +} + TEST_F(JsonWriterTest, ErrorCases) { cudf::test::strings_column_wrapper col1{"a", "b", "c"}; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 89e704f3ed3..cce0adbf317 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -43,9 +43,9 @@ template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using str_col = column_wrapper; using bool_col = column_wrapper; @@ -1358,21 +1358,22 @@ TEST_P(OrcWriterTestStripes, StripeSize) cols.push_back(col.release()); auto const expected = std::make_unique(std::move(cols)); - auto validate = [&](std::vector const& orc_buffer) { - auto const expected_stripe_num = - std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); - EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) - .use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - }; + auto validate = + [&, &size_bytes = size_bytes, &size_rows = size_rows](std::vector const& orc_buffer) { + auto const expected_stripe_num = + std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); + EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); + + cudf::io::orc_reader_options in_opts = + 
cudf::io::orc_reader_options::builder( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) + .use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + }; { std::vector out_buffer_chunked; diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 8b03e94191e..f1286a00d22 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -98,7 +98,7 @@ TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) // list constexpr int vals_per_row = 4; auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( - 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + 0, [](cudf::size_type idx) { return idx * vals_per_row; }); cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, c1_offset_iter + num_rows + 1); cudf::test::fixed_width_column_wrapper c1_vals( diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index dc8e68b3a15..4a5309f3ba7 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -1189,15 +1189,12 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) cudf::test::fixed_width_column_wrapper values(value_iter, value_iter + num_values, validity); // ~256k values with num_nesting_levels = 16 - int total_values_produced = num_values; - auto prev_col = values.release(); + auto prev_col = values.release(); for (int idx = 0; idx < num_nesting_levels; idx++) { - auto const depth = num_nesting_levels - idx; auto const num_rows = (1 << (num_nesting_levels - idx)); auto offsets_iter = cudf::detail::make_counting_transform_iterator( - 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); - total_values_produced += (num_rows + 1); + 0, [](cudf::size_type i) { return i * rows_per_level; }); cudf::test::fixed_width_column_wrapper offsets(offsets_iter, offsets_iter + num_rows + 1); 
diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 7c305235ea6..a0b48f54854 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -1302,24 +1302,24 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); std::array expected_null_counts{4, 4, 4, 6, 4, 6, 4, 5, 11}; - std::vector const expected_def_hists[] = {{1, 1, 2, 3}, - {1, 3, 10}, - {1, 1, 2, 10}, - {1, 1, 2, 2, 8}, - {1, 1, 1, 1, 10}, - {1, 1, 1, 1, 2, 8}, - {1, 3, 9}, - {1, 3, 1, 8}, - {1, 0, 4, 1, 1, 4, 9}}; - std::vector const expected_rep_hists[] = {{4, 3}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 5}, - {4, 4, 5}, - {4, 6, 2, 8}}; + std::vector> const expected_def_hists = {{1, 1, 2, 3}, + {1, 3, 10}, + {1, 1, 2, 10}, + {1, 1, 2, 2, 8}, + {1, 1, 1, 1, 10}, + {1, 1, 1, 1, 2, 8}, + {1, 3, 9}, + {1, 3, 1, 8}, + {1, 0, 4, 1, 1, 4, 9}}; + std::vector> const expected_rep_hists = {{4, 3}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 5}, + {4, 4, 5}, + {4, 6, 2, 8}}; auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 8794f2ee304..6c5e9cdf07a 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -290,7 +290,8 @@ class custom_test_data_sink : public cudf::io::data_sink { CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); } - ~custom_test_data_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~custom_test_data_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -981,13 +982,15 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) 
TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) { - std::vector truncated_min[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, - {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_min{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, + {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; - std::vector truncated_max[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, - {0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_max{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, + {0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; cudf::test::lists_column_wrapper col0{ {0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}}; diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 93754091b3f..178edc52dd3 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -314,7 +314,7 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -362,7 +362,7 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), 
cudf::out_of_bounds_policy::NULLIFY); @@ -398,7 +398,7 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -423,7 +423,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true}); @@ -468,7 +468,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; auto col0_gold_names_col = strcol_wrapper{ "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Ãœberwald"}; diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index 97979e79010..bea044496b3 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -97,7 +97,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyColumns) "hi", "hj"}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = 
cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -296,7 +296,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns) true, false, false}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 2e09f25b51f..6208d395f0a 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -349,7 +349,7 @@ TYPED_TEST(MergeTest_, Merge1KeyColumns) cudf::test::fixed_width_column_wrapper expectedDataWrap1(seq_out1, seq_out1 + outputRows); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -452,7 +452,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) cudf::size_type inputRows = 40; // data: 0 2 4 6 | valid: 1 1 1 0 - auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 0; // <- no shortcut to this can avoid compiler errors } else { @@ -465,7 +465,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) leftColWrap1(sequence1, sequence1 + inputRows, valid_sequence1); // data: 1 3 5 7 | valid: 1 1 1 0 - auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 1; } else diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp 
index 19996f827cf..bc0321bd40a 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -1092,11 +1092,10 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) auto aggregates = std::vector>>(); - aggregates.push_back(std::move(cudf::make_max_aggregation())); - aggregates.push_back(std::move(cudf::make_min_aggregation())); - aggregates.push_back(std::move(cudf::make_sum_aggregation())); - aggregates.push_back( - std::move(cudf::make_product_aggregation())); + aggregates.push_back(cudf::make_max_aggregation()); + aggregates.push_back(cudf::make_min_aggregation()); + aggregates.push_back(cudf::make_sum_aggregation()); + aggregates.push_back(cudf::make_product_aggregation()); auto output_type = cudf::data_type{cudf::type_to_id()}; for (auto&& agg : aggregates) { diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 1858cd7782e..b12bf08520f 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -356,7 +356,7 @@ void test_replace(cudf::host_span input_column, for (size_t i = 0; i < values_to_replace_column.size(); i++) { size_t k = 0; - auto pred = [=, &k, &reference_result, &expected_valid, &isReplaced](T element) { + auto pred = [=, &k, &expected_valid, &isReplaced](T element) { bool toBeReplaced = false; if (!isReplaced[k]) { if (!input_has_nulls || expected_valid[k]) { @@ -503,7 +503,7 @@ TYPED_TEST(ReplaceTest, LargeScaleReplaceTest) const size_t REPLACE_SIZE = 10000; thrust::host_vector input_column(DATA_SIZE); - std::generate(std::begin(input_column), std::end(input_column), [REPLACE_SIZE]() { + std::generate(std::begin(input_column), std::end(input_column), []() { return std::rand() % (REPLACE_SIZE); }); diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index f702dc78371..165e0347785 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ 
b/cpp/tests/rolling/collect_ops_test.cpp @@ -214,7 +214,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_list_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); @@ -338,7 +338,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 4, 8, 12, 12, 12}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -373,7 +373,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 3, 5, 8, 8, 8}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -1499,7 +1499,7 @@ TYPED_TEST(TypedCollectSetTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_set_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2}, {1, 2, 4}, {2, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); diff --git 
a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index ec726878b34..0eaab0c9f7a 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,11 @@ using cudf::test::iterators::nulls_at; auto constexpr null = int32_t{0}; // NULL representation for int32_t; +// clang-tidy doesn't think std::transform can handle a +// thrust::constant_iterator, so this is a workaround that uses nulls_at +// instead of no_nulls +auto no_nulls_list() { return nulls_at({}); } + struct OffsetRowWindowTest : public cudf::test::BaseFixture { static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -210,7 +215,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), - lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, no_nulls}); + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, + no_nulls_list()}); } TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) @@ -250,7 +256,7 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, - no_nulls}); + no_nulls_list()}); } // To test that preceding bounds are clamped correctly at group boundaries. 
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index c2c22986975..6e0dc16dca9 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -541,7 +541,7 @@ class RollingTest : public cudf::test::BaseFixture { agg_op op; for (cudf::size_type i = 0; i < num_rows; i++) { - OutputType val = agg_op::template identity(); + auto val = agg_op::template identity(); // load sizes min_periods = std::max(min_periods, 1); // at least one observation is required diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index 2d37de920d5..2b79911a95a 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -190,7 +190,7 @@ TEST_F(ListScalarTest, MoveConstructorNonNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(data_ptr, s2.view().data()); - EXPECT_EQ(s.view().data(), nullptr); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT } TEST_F(ListScalarTest, MoveConstructorNested) @@ -205,8 +205,8 @@ TEST_F(ListScalarTest, MoveConstructorNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(offset_ptr, s2.view().child(0).data()); EXPECT_EQ(data_ptr, s2.view().child(1).data()); - EXPECT_EQ(s.view().data(), nullptr); - EXPECT_EQ(s.view().num_children(), 0); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT + EXPECT_EQ(s.view().num_children(), 0); // NOLINT } struct StructScalarTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 48711c21715..7584003e800 100644 --- a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,7 +35,6 @@ using strings_col = cudf::test::strings_column_wrapper; constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; constexpr int32_t null{0}; // Mark for null child elements at the current level -constexpr int32_t XXX{0}; // Mark for null elements at all levels using TestTypes = cudf::test::Concat> grand_child; - grand_child.push_back(std::move(col4.release())); + grand_child.push_back(col4.release()); auto child_col_2 = cudf::make_structs_column(6, std::move(grand_child), 0, rmm::device_buffer{}); child_columns2.push_back(std::move(child_col_2)); auto struct_col3 = diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp index 4d7d23dc881..d5b6915b520 100644 --- a/cpp/tests/stream_compaction/unique_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -43,7 +43,6 @@ auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; -auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; using int32s_col = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 443f4548b2c..07b2d77cc04 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -29,8 +29,6 @@ #include -auto constexpr null{0}; // null at current level -auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; diff --git a/cpp/tests/streams/strings/find_test.cpp 
b/cpp/tests/streams/strings/find_test.cpp index 52839c6fc9f..e5a1ee0988c 100644 --- a/cpp/tests/streams/strings/find_test.cpp +++ b/cpp/tests/streams/strings/find_test.cpp @@ -46,4 +46,5 @@ TEST_F(StringsFindTest, Find) auto const pattern = std::string("[a-z]"); auto const prog = cudf::strings::regex_program::create(pattern); cudf::strings::findall(view, *prog, cudf::test::get_default_stream()); + cudf::strings::find_re(view, *prog, cudf::test::get_default_stream()); } diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 73da4d081e2..4821a7fa999 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -149,6 +150,22 @@ TEST_F(StringsFindallTests, LargeRegex) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, FindTest) +{ + auto const valids = cudf::test::iterators::null_at(5); + cudf::test::strings_column_wrapper input( + {"3A", "May4", "Jan2021", "March", "A9BC", "", "", "abcdef ghijklm 12345"}, valids); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("\\d+"); + + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::find_re(sv, *prog); + auto expected = + cudf::test::fixed_width_column_wrapper({0, 3, 3, -1, 1, 0, -1, 15}, valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + TEST_F(StringsFindallTests, NoMatches) { cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); @@ -169,10 +186,16 @@ TEST_F(StringsFindallTests, EmptyTest) auto prog = cudf::strings::regex_program::create(pattern); cudf::test::strings_column_wrapper input; - auto sv = cudf::strings_column_view(input); - auto results = cudf::strings::findall(sv, *prog); - - using LCW = cudf::test::lists_column_wrapper; - LCW expected; - 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + auto sv = cudf::strings_column_view(input); + { + auto results = cudf::strings::findall(sv, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + } + { + auto results = cudf::strings::find_re(sv, *prog); + auto expected = cudf::test::fixed_width_column_wrapper{}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + } } diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index ce5f68de3c9..26bcfe8028d 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -425,7 +426,7 @@ TYPED_TEST(StringsIntegerConvertTest, IntegerToHex) if (v == 0) { return std::string("00"); } // special handling for single-byte types if constexpr (std::is_same_v || std::is_same_v) { - char const hex_digits[16] = { + std::array const hex_digits = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; std::string str; str += hex_digits[(v & 0xF0) >> 4]; diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index f0010fc1ed9..219bd6d8b01 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -635,9 +635,8 @@ TEST_F(StructColumnWrapperTest, TestStructsColumnWithEmptyChild) auto mask_vec = std::vector{true, false, false}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(mask_vec.begin(), mask_vec.end()); - auto structs_col = - cudf::make_structs_column(num_rows, std::move(cols), null_count, std::move(null_mask)); - EXPECT_NO_THROW(structs_col->view()); + EXPECT_NO_THROW(auto structs_col = cudf::make_structs_column( + num_rows, std::move(cols), null_count, std::move(null_mask))); } CUDF_TEST_PROGRAM_MAIN() diff --git 
a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index 215ca158f37..2684123c08a 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -32,7 +32,7 @@ struct MaskToNullTest : public cudf::test::BaseFixture { { cudf::test::fixed_width_column_wrapper input_column( input.begin(), input.end(), val.begin()); - std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and()); + std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and<>()); auto sample = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp b/cpp/tests/transform/integration/unary_transform_test.cpp index 1785848ec77..0bdf5b321ac 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -47,7 +47,7 @@ void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool TEST_F(UnaryOperationIntegrationTest, Transform_FP32_FP32) { // c = a*a*a*a - char const* cuda = + std::string const cuda = R"***( __device__ inline void fdsf ( float* C, @@ -58,7 +58,7 @@ __device__ inline void fdsf ( } )***"; - char const* ptx = + std::string const ptx = R"***( // // Generated by NVIDIA NVVM Compiler @@ -101,17 +101,17 @@ __device__ inline void fdsf ( auto op = [](dtype a) { return a * a * a * a; }; auto data_init = [](cudf::size_type row) { return row % 3; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) { // c = a * a - a - char const cuda[] = + std::string const cuda = "__device__ inline void f(int* output,int input){*output = input*input - input;}"; - char const* ptx = + std::string const 
ptx = R"***( .func _Z1fPii( .param .b64 _Z1fPii_param_0, @@ -136,8 +136,8 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) auto op = [](dtype a) { return a * a - a; }; auto data_init = [](cudf::size_type row) { return row % 78; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) @@ -145,7 +145,7 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) // Capitalize all the lower case letters // Assuming ASCII, the PTX code is compiled from the following CUDA code - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f( signed char* output, @@ -159,7 +159,7 @@ __device__ inline void f( } )***"; - char const ptx[] = + std::string const ptx = R"***( .func _Z1fPcc( .param .b64 _Z1fPcc_param_0, @@ -191,15 +191,15 @@ __device__ inline void f( auto op = [](dtype a) { return std::toupper(a); }; auto data_init = [](cudf::size_type row) { return 'a' + (row % 26); }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_Datetime) { // Add one day to timestamp in microseconds - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) { @@ -217,7 +217,7 @@ __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) auto random_eng = cudf::test::UniformRandomGenerator(0, 100000000); auto data_init = [&random_eng](cudf::size_type row) { return random_eng.generate(); }; - test_udf(cuda, op, data_init, 500, false); + test_udf(cuda.c_str(), op, data_init, 500, false); } } // namespace transformation diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu 
b/cpp/tests/utilities_tests/batched_memcpy_tests.cu new file mode 100644 index 00000000000..98657f8e224 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +template +struct BatchedMemcpyTest : public cudf::test::BaseFixture {}; + +TEST(BatchedMemcpyTest, BasicTest) +{ + using T1 = int64_t; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + // Buffer lengths (in number of elements) + std::vector const h_lens{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Total number of buffers + auto const num_buffs = h_lens.size(); + + // Exclusive sum of buffer lengths for pointers + std::vector h_lens_excl_sum(num_buffs); + std::exclusive_scan(h_lens.begin(), h_lens.end(), h_lens_excl_sum.begin(), 0); + + // Corresponding buffer sizes (in bytes) + std::vector h_sizes_bytes; + h_sizes_bytes.reserve(num_buffs); + std::transform( + h_lens.cbegin(), h_lens.cend(), std::back_inserter(h_sizes_bytes), [&](auto& size) { + return size * sizeof(T1); + }); + + // Initialize random engine + auto constexpr seed = 0xcead; + std::mt19937 engine{seed}; + 
using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + uniform_distribution dist{}; + + // Generate a src vector of random data vectors + std::vector> h_sources; + h_sources.reserve(num_buffs); + std::transform(h_lens.begin(), h_lens.end(), std::back_inserter(h_sources), [&](auto size) { + std::vector data(size); + std::generate_n(data.begin(), size, [&]() { return T1{dist(engine)}; }); + return data; + }); + // Copy the vectors to device + std::vector> h_device_vecs; + h_device_vecs.reserve(h_sources.size()); + std::transform( + h_sources.begin(), h_sources.end(), std::back_inserter(h_device_vecs), [stream, mr](auto& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + // Pointers to the source vectors + std::vector h_src_ptrs; + h_src_ptrs.reserve(h_sources.size()); + std::transform( + h_device_vecs.begin(), h_device_vecs.end(), std::back_inserter(h_src_ptrs), [](auto& vec) { + return static_cast(vec.data()); + }); + // Copy the source data pointers to device + auto d_src_ptrs = cudf::detail::make_device_uvector_async(h_src_ptrs, stream, mr); + + // Total number of elements in all buffers + auto const total_buff_len = std::accumulate(h_lens.cbegin(), h_lens.cend(), 0); + + // Create one giant buffer for destination + auto d_dst_data = cudf::detail::make_zeroed_device_uvector_async(total_buff_len, stream, mr); + // Pointers to destination buffers within the giant destination buffer + std::vector h_dst_ptrs(num_buffs); + std::for_each(thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(num_buffs), + [&](auto i) { return h_dst_ptrs[i] = d_dst_data.data() + h_lens_excl_sum[i]; }); + // Copy destination data pointers to device + auto d_dst_ptrs = cudf::detail::make_device_uvector_async(h_dst_ptrs, stream, mr); + + // Copy buffer size iterators (in bytes) to device + auto d_sizes_bytes 
= cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); + + // Run the batched memcpy + cudf::detail::batched_memcpy_async( + d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); + + // Expected giant destination buffer after the memcpy + std::vector expected_buffer; + expected_buffer.reserve(total_buff_len); + std::for_each(h_sources.cbegin(), h_sources.cend(), [&expected_buffer](auto& source) { + expected_buffer.insert(expected_buffer.end(), source.begin(), source.end()); + }); + + // Copy over the result destination buffer to host and synchronize the stream + auto result_dst_buffer = + cudf::detail::make_std_vector_sync(cudf::device_span(d_dst_data), stream); + + // Check if both vectors are equal + EXPECT_TRUE( + std::equal(expected_buffer.begin(), expected_buffer.end(), result_dst_buffer.begin())); +} diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu index bed0f40d70e..0eeb7b95318 100644 --- a/cpp/tests/utilities_tests/batched_memset_tests.cu +++ b/cpp/tests/utilities_tests/batched_memset_tests.cu @@ -18,8 +18,8 @@ #include #include +#include #include -#include #include #include #include @@ -78,7 +78,7 @@ TEST(MultiBufferTestIntegral, BasicTest1) }); // Function Call - cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream); + cudf::detail::batched_memset(memset_bufs, uint64_t{0}, stream); // Set all buffer regions to 0 for expected comparison std::for_each( diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index d052e20eedb..cfab570833b 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -28,16 +28,17 @@ class LoggerTest : public cudf::test::BaseFixture { std::vector prev_sinks; public: - LoggerTest() : prev_level{cudf::logger().level()}, prev_sinks{cudf::logger().sinks()} + LoggerTest() + : prev_level{cudf::detail::logger().level()}, 
prev_sinks{cudf::detail::logger().sinks()} { - cudf::logger().sinks() = {std::make_shared(oss)}; - cudf::logger().set_formatter( + cudf::detail::logger().sinks() = {std::make_shared(oss)}; + cudf::detail::logger().set_formatter( std::unique_ptr(new spdlog::pattern_formatter("%v"))); } ~LoggerTest() override { - cudf::logger().set_level(prev_level); - cudf::logger().sinks() = prev_sinks; + cudf::detail::logger().set_level(prev_level); + cudf::detail::logger().sinks() = prev_sinks; } void clear_sink() { oss.str(""); } @@ -46,32 +47,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::logger().critical("crit msg"); + cudf::detail::logger().critical("crit msg"); ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); - cudf::logger().info("info"); - cudf::logger().warn("warn"); - cudf::logger().error("error"); - cudf::logger().critical("critical"); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); + cudf::detail::logger().error("error"); + cudf::detail::logger().critical("critical"); ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::logger().set_level(spdlog::level::warn); - cudf::logger().info("info"); - cudf::logger().warn("warn"); + cudf::detail::logger().set_level(spdlog::level::warn); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::logger().set_level(spdlog::level::debug); - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); + cudf::detail::logger().set_level(spdlog::level::debug); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); } diff --git a/dependencies.yaml b/dependencies.yaml 
index b192158c4ea..3561b22965d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -602,7 +602,7 @@ dependencies: packages: - fsspec>=0.6.0 - &numpy numpy>=1.23,<3.0a0 - - pandas>=2.0,<2.2.3dev0 + - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: - output_types: [conda, requirements, pyproject] @@ -748,6 +748,10 @@ dependencies: packages: - *numba-cuda-dep - pandas==2.0.* + - matrix: {dependencies: "latest"} + packages: + - numba-cuda==0.0.15 + - pandas==2.2.3 - matrix: packages: - output_types: conda diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e21536e2e97..052479d6720 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -49,3 +49,4 @@ This page provides API documentation for pylibcudf. io/index.rst strings/index.rst + nvtext/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst new file mode 100644 index 00000000000..abb45e426a8 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst @@ -0,0 +1,6 @@ +============= +edit_distance +============= + +.. automodule:: pylibcudf.nvtext.edit_distance + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst new file mode 100644 index 00000000000..b5cd5ee42c3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -0,0 +1,7 @@ +nvtext +====== + +.. 
toctree:: + :maxdepth: 1 + + edit_distance diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index e73ea3370ec..48dc8a13c3e 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -11,10 +11,13 @@ strings find find_multiple findall + padding regex_flags regex_program repeat replace + side_type slice split strip + wrap diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst new file mode 100644 index 00000000000..5b417024fd5 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst @@ -0,0 +1,6 @@ +======= +padding +======= + +.. automodule:: pylibcudf.strings.padding + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst new file mode 100644 index 00000000000..d5aef9c4f75 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst @@ -0,0 +1,6 @@ +========= +side_type +========= + +.. automodule:: pylibcudf.strings.side_type + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst new file mode 100644 index 00000000000..bd825f78568 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst @@ -0,0 +1,6 @@ +==== +wrap +==== + +.. 
automodule:: pylibcudf.strings.wrap + :members: diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index bc5e085ec39..d844466120f 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -13,12 +13,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.filling cimport calendrical_month_sequence from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.datetime import DatetimeComponent from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar -import pylibcudf as plc - @acquire_spill_lock() def add_months(Column col, Column months): @@ -40,9 +39,39 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - result = Column.from_pylibcudf( - plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) - ) + + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + cdef libcudf_datetime.datetime_component component + + component_names = { + "year": DatetimeComponent.YEAR, + "month": DatetimeComponent.MONTH, + "day": DatetimeComponent.DAY, + "weekday": DatetimeComponent.WEEKDAY, + "hour": DatetimeComponent.HOUR, + "minute": DatetimeComponent.MINUTE, + "second": DatetimeComponent.SECOND, + "millisecond": DatetimeComponent.MILLISECOND, + "microsecond": DatetimeComponent.MICROSECOND, + "nanosecond": DatetimeComponent.NANOSECOND, + } + if field == "day_of_year": + with nogil: + c_result = move(libcudf_datetime.day_of_year(col_view)) + elif field in component_names: + component = component_names[field] + with nogil: + c_result = move( + libcudf_datetime.extract_datetime_component( + col_view, + component + ) + ) + else: + raise ValueError(f"Invalid field: '{field}'") + + result = Column.from_unique_ptr(move(c_result)) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git 
a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx index e3c2273345a..3dd99c42d76 100644 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx @@ -2,37 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.edit_distance cimport ( - edit_distance as cpp_edit_distance, - edit_distance_matrix as cpp_edit_distance_matrix, -) +from pylibcudf cimport nvtext from cudf._lib.column cimport Column @acquire_spill_lock() def edit_distance(Column strings, Column targets): - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def edit_distance_matrix(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance_matrix( + strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 60a6795a402..d9595f4ab0a 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -3,25 +3,13 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - from 
cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from libcpp.memory cimport unique_ptr -from libcpp.string cimport string from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, to_floats as cpp_to_floats, @@ -33,11 +21,6 @@ from pylibcudf.libcudf.strings.convert.convert_integers cimport ( is_hex as cpp_is_hex, to_integers as cpp_to_integers, ) -from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( - integers_to_ipv4 as cpp_integers_to_ipv4, - ipv4_to_integers as cpp_ipv4_to_integers, - is_ipv4 as cpp_is_ipv4, -) from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id @@ -427,77 +410,21 @@ def stoul(Column input_col): return string_to_integer(input_col, cudf.dtype("uint64")) -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) - - def to_booleans(Column input_col): - - 
return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - ---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_booleans.to_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + ) + return Column.from_pylibcudf(plc_column) def from_booleans(Column input_col): - return _from_booleans(input_col) + plc_column = plc.strings.convert.convert_booleans.from_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + as_device_scalar("False").c_value, + ) + return Column.from_pylibcudf(plc_column) def int2timestamp( @@ -520,11 +447,10 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef string c_timestamp_format = format.encode("UTF-8") return Column.from_pylibcudf( plc.strings.convert.convert_datetime.from_timestamps( input_col.to_pylibcudf(mode="read"), - c_timestamp_format, + format, names.to_pylibcudf(mode="read") ) ) @@ -545,12 +471,11 @@ def timestamp2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = 
format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_datetime.to_timestamps( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -572,16 +497,11 @@ def istimestamp(Column input_col, str format): """ if input_col.size == 0: return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_datetime.is_timestamp( + input_col.to_pylibcudf(mode="read"), + format + ) + return Column.from_pylibcudf(plc_column) def timedelta2int(Column input_col, dtype, format): @@ -599,12 +519,11 @@ def timedelta2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.to_durations( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -623,12 +542,10 @@ def int2timedelta(Column input_col, str format): A Column with Timedelta represented in string format """ - - cdef string c_duration_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.from_durations( input_col.to_pylibcudf(mode="read"), - c_duration_format + format ) ) @@ -646,14 +563,10 @@ def int2ip(Column input_col): A Column with integer represented in string ipv4 format """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_ipv4(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + input_col.to_pylibcudf(mode="read") + ) + return 
Column.from_pylibcudf(plc_column) def ip2int(Column input_col): @@ -669,14 +582,10 @@ def ip2int(Column input_col): A Column with ipv4 represented as integer """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ipv4_to_integers(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def is_ipv4(Column source_strings): @@ -685,15 +594,10 @@ def is_ipv4(Column source_strings): that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn where nnn is integer digits in [0,255]. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_ipv4( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.is_ipv4( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def htoi(Column input_col, **kwargs): diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 4bf8a9b1a8f..e712937f816 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -71,16 +71,9 @@ startswith_multiple, ) from cudf._lib.strings.find_multiple import find_multiple -from cudf._lib.strings.findall import findall +from cudf._lib.strings.findall import find_re, findall from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object -from cudf._lib.strings.padding import ( - SideType, - center, - ljust, - pad, - rjust, - zfill, -) +from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( insert, diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx 
b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index a8df8c9a92c..96dcd021c3b 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -1,22 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -import cudf - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( - from_fixed_point as cpp_from_fixed_point, - is_fixed_point as cpp_is_fixed_point, - to_fixed_point as cpp_to_fixed_point, -) -from pylibcudf.libcudf.types cimport data_type, type_id - from cudf._lib.column cimport Column +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import pylibcudf as plc @acquire_spill_lock() @@ -32,14 +21,10 @@ def from_decimal(Column input_col): ------- A column of strings representing the input decimal values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_fixed_point( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,25 +42,11 @@ def to_decimal(Column input_col, object out_type): ------- A column of decimals parsed from the string values. 
""" - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef int scale = out_type.scale - cdef data_type c_out_type - if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(type_id.DECIMAL32, -scale) - elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(type_id.DECIMAL64, -scale) - elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(type_id.DECIMAL128, -scale) - else: - raise TypeError("should be a decimal dtype") - with nogil: - c_result = move( - cpp_to_fixed_point( - input_column_view, - c_out_type)) - - result = Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type), + ) + result = Column.from_pylibcudf(plc_column) result.dtype.precision = out_type.precision return result @@ -98,14 +69,8 @@ def is_fixed_point(Column input_col, object dtype): ------- A Column of booleans indicating valid decimal conversion. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = input_col.view() - cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) - with nogil: - c_result = move(cpp_is_fixed_point( - source_view, - c_dtype - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(dtype), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 0e758d5b322..3e7a504d535 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -23,3 +23,19 @@ def findall(Column source_strings, object pattern, uint32_t flags): prog, ) return Column.from_pylibcudf(plc_result) + + +@acquire_spill_lock() +def find_re(Column source_strings, object pattern, uint32_t flags): + """ + Returns character positions where the pattern first matches + the elements in source_strings. + """ + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.find_re( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx index d0239e91ec3..015a2ebab8a 100644 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ b/python/cudf/cudf/_lib/strings/padding.pyx @@ -1,64 +1,31 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from enum import IntEnum - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.strings.padding cimport ( - pad as cpp_pad, - zfill as cpp_zfill, -) -from pylibcudf.libcudf.strings.side_type cimport ( - side_type, - underlying_type_t_side_type, -) - - -class SideType(IntEnum): - LEFT = side_type.LEFT - RIGHT = side_type.RIGHT - BOTH = side_type.BOTH +import pylibcudf as plc @acquire_spill_lock() def pad(Column source_strings, size_type width, fill_char, - side=SideType.LEFT): + side=plc.strings.side_type.SideType.LEFT): """ Returns a Column by padding strings in `source_strings` up to the given `width`. Direction of padding is to be specified by `side`. The additional characters being filled can be changed by specifying `fill_char`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - cdef side_type pad_direction = ( - side + plc_result = plc.strings.padding.pad( + source_strings.to_pylibcudf(mode="read"), + width, + side, + fill_char, ) - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - pad_direction, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -68,19 +35,13 @@ def zfill(Column source_strings, Returns a Column by prepending strings in `source_strings` with '0' characters up to the given `width`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_zfill( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.padding.zfill( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) -@acquire_spill_lock() def center(Column source_strings, size_type width, fill_char): @@ -89,23 +50,9 @@ def center(Column source_strings, in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.BOTH, - f_char - )) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH) - return Column.from_unique_ptr(move(c_result)) - -@acquire_spill_lock() def ljust(Column source_strings, size_type width, fill_char): @@ -113,23 +60,9 @@ def ljust(Column source_strings, Returns a Column by filling right side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT) - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.RIGHT, - f_char - )) - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() def rjust(Column source_strings, size_type width, fill_char): @@ -137,17 +70,4 @@ def rjust(Column source_strings, Returns a Column by filling left side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.LEFT, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index 38ecb21a94c..982c5a600e7 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -1,18 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.side_type cimport side_type -from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar import pylibcudf as plc @@ -24,15 +14,12 @@ def strip(Column source_strings, The set of characters need be stripped from left and right side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - return Column.from_pylibcudf( - plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.SideType.BOTH, - repl.c_value - ) + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.BOTH, + py_repl.device_value.c_value, ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -43,24 +30,12 @@ def lstrip(Column source_strings, The set of characters need be stripped from left side can be specified by `py_repl`. 
""" - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.LEFT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.LEFT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -71,21 +46,9 @@ def rstrip(Column source_strings, The set of characters need be stripped from right side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.RIGHT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.RIGHT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx index eed5cf33b10..2b40f01f818 100644 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ b/python/cudf/cudf/_lib/strings/wrap.pyx @@ -1,17 +1,13 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def wrap(Column source_strings, @@ -21,14 +17,8 @@ def wrap(Column source_strings, in the Column to be formatted in paragraphs with length less than a given `width`. """ - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_wrap( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.wrap.wrap( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e2bdecbe67a..871ffc6269d 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -3,7 +3,7 @@ import pandas as pd from packaging import version -PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.2") +PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.3") PANDAS_VERSION = version.parse(pd.__version__) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d0ea4612a1b..2c9b0baa9b6 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -480,6 +480,11 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: if dtype == self.dtype: return self + elif isinstance(dtype, pd.DatetimeTZDtype): + raise TypeError( + "Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. " + "Use tz_localize instead." 
+ ) return libcudf.unary.cast(self, dtype=dtype) def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] @@ -940,6 +945,16 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: def as_string_column(self) -> cudf.core.column.StringColumn: return self._local_time.as_string_column() + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: + if isinstance(dtype, pd.DatetimeTZDtype) and dtype != self.dtype: + if dtype.unit != self.time_unit: + # TODO: Doesn't check that new unit is valid. + casted = self._with_type_metadata(dtype) + else: + casted = self + return casted.tz_convert(str(dtype.tz)) + return super().as_datetime_column(dtype) + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( self._local_time, field diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index da422db5eae..b50e23bd52e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -11,6 +11,8 @@ import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf import cudf.api.types from cudf import _lib as libcudf @@ -2966,7 +2968,7 @@ def pad( raise TypeError(msg) try: - side = libstrings.SideType[side.upper()] + side = plc.strings.side_type.SideType[side.upper()] except KeyError: raise ValueError( "side has to be either one of {'left', 'right', 'both'}" @@ -3624,6 +3626,46 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) + def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: + """ + Find first occurrence of pattern or regular expression in the + Series/Index. + + Parameters + ---------- + pat : str + Pattern or regular expression. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. 
re.MULTILINE) + + Returns + ------- + Series + A Series of position values where the pattern first matches + each string. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series(['Lion', 'Monkey', 'Rabbit', 'Cat']) + >>> s.str.find_re('[ti]') + 0 1 + 1 -1 + 2 4 + 3 2 + dtype: int32 + """ + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "Unsupported value for `flags` parameter" + ) + + data = libstrings.find_re(self._column, pat, flags) + return self._return_or_inplace(data) + def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ Find all first occurrences of patterns in the Series/Index. diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 6e5abb2b82b..3d132c92d54 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -681,7 +681,7 @@ def _tile(A, reps): nval = len(value_vars) dtype = min_unsigned_type(nval) - if not var_name: + if var_name is None: var_name = "variable" if not value_vars: diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index cea86a5499e..691da224f44 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -266,3 +266,25 @@ def test_pandas_compatible_non_zoneinfo_raises(klass): with cudf.option_context("mode.pandas_compatible", True): with pytest.raises(NotImplementedError): cudf.from_pandas(pandas_obj) + + +def test_astype_naive_to_aware_raises(): + ser = cudf.Series([datetime.datetime(2020, 1, 1)]) + with pytest.raises(TypeError): + ser.astype("datetime64[ns, UTC]") + with pytest.raises(TypeError): + ser.to_pandas().astype("datetime64[ns, UTC]") + + +@pytest.mark.parametrize("unit", ["ns", "us"]) +def test_astype_aware_to_aware(unit): + ser = cudf.Series( + [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)] + 
) + result = ser.astype(f"datetime64[{unit}, US/Pacific]") + expected = ser.to_pandas().astype(f"datetime64[{unit}, US/Pacific]") + zoneinfo_type = pd.DatetimeTZDtype( + expected.dtype.unit, zoneinfo.ZoneInfo(str(expected.dtype.tz)) + ) + expected = ser.astype(zoneinfo_type) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 4235affd4d1..3adbe1d2a74 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -119,6 +119,15 @@ def test_melt_str_scalar_id_var(): assert_eq(result, expected) +def test_melt_falsy_var_name(): + df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) + result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") + expected = pd.melt( + df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" + ) + assert_eq(result, expected) + + @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index cc88cc79769..45143211a11 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1899,6 +1899,26 @@ def test_string_findall(pat, flags): assert_eq(expected, actual) +@pytest.mark.parametrize( + "pat, flags, pos", + [ + ("Monkey", 0, [-1, 0, -1, -1]), + ("on", 0, [2, 1, -1, 1]), + ("bit", 0, [-1, -1, 3, -1]), + ("on$", 0, [2, -1, -1, -1]), + ("on$", re.MULTILINE, [2, -1, -1, 1]), + ("o.*k", re.DOTALL, [-1, 1, -1, 1]), + ], +) +def test_string_find_re(pat, flags, pos): + test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] + gs = cudf.Series(test_data) + + expected = pd.Series(pos, dtype=np.int32) + actual = gs.str.find_re(pat, flags) + assert_eq(expected, actual) + + def test_string_replace_multi(): ps = pd.Series(["hello", "goodbye"]) gs = cudf.Series(["hello", "goodbye"]) diff --git 
a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 605f9be5a49..1b730ffd13c 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "ptxcompiler", "pyarrow>=14.0.0,<18.0.0a0", "pylibcudf==24.12.*,>=0.0.0a0", diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c401e5a2f17..a418560b31c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -914,7 +914,7 @@ def do_evaluate( col = self.children[0].evaluate(df, context=context, mapping=mapping) is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format.encode() + col.obj, format ) if strict: @@ -937,7 +937,7 @@ def do_evaluate( ) return Column( plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format.encode() + res.columns()[0], self.dtype, format ) ) elif self.name == pl_expr.StringFunction.Replace: @@ -961,16 +961,16 @@ def do_evaluate( class TemporalFunction(Expr): __slots__ = ("name", "options", "children") _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { - pl_expr.TemporalFunction.Year: "year", - pl_expr.TemporalFunction.Month: "month", - pl_expr.TemporalFunction.Day: "day", - pl_expr.TemporalFunction.WeekDay: "weekday", - pl_expr.TemporalFunction.Hour: "hour", - pl_expr.TemporalFunction.Minute: "minute", - pl_expr.TemporalFunction.Second: "second", - pl_expr.TemporalFunction.Millisecond: "millisecond", - pl_expr.TemporalFunction.Microsecond: "microsecond", - pl_expr.TemporalFunction.Nanosecond: "nanosecond", + pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, + pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, + pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, + pl_expr.TemporalFunction.WeekDay: 
plc.datetime.DatetimeComponent.WEEKDAY, + pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, + pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, + pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, + pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, + pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, + pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, } _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] @@ -1003,8 +1003,12 @@ def do_evaluate( ] (column,) = columns if self.name == pl_expr.TemporalFunction.Microsecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) millis_as_micros = plc.binaryop.binary_operation( millis, plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), @@ -1019,9 +1023,15 @@ def do_evaluate( ) return Column(total_micros) elif self.name == pl_expr.TemporalFunction.Nanosecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") - nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) + nanos = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.NANOSECOND + ) millis_as_nanos = plc.binaryop.binary_operation( millis, plc.interop.from_arrow(pa.scalar(1_000_000, 
type=pa.int32())), diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 76e47b50c3b..ce825c7647b 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "rapids-dask-dependency==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 2b208e2e021..5f9a04d3cee 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -41,6 +41,9 @@ set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(CUDF_EXPORT_NVCOMP OFF) +endif() set(CUDA_STATIC_RUNTIME ON) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) diff --git a/python/pylibcudf/LICENSE b/python/pylibcudf/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/pylibcudf/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a7cb66d7b16..1d72eacac12 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -66,3 +66,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) add_subdirectory(strings) add_subdirectory(io) +add_subdirectory(nvtext) diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index a384edd456d..b98b37fe0fd 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . 
cimport ( lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -78,4 +79,5 @@ __all__ = [ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 2a5365e8fad..304f27be340 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -28,6 +28,7 @@ lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -92,4 +93,5 @@ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd index 2fce48cf1b4..72ce680ba7a 100644 --- a/python/pylibcudf/pylibcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/datetime.pxd @@ -1,8 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.libcudf.datetime cimport datetime_component + from .column cimport Column cpdef Column extract_year( Column col ) + +cpdef Column extract_datetime_component( + Column col, + datetime_component component +) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index e8e0caaf42d..784d29128bf 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -3,19 +3,14 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.datetime cimport ( - day_of_year as cpp_day_of_year, - extract_day as cpp_extract_day, - extract_hour as cpp_extract_hour, - extract_microsecond_fraction as cpp_extract_microsecond_fraction, - extract_millisecond_fraction as cpp_extract_millisecond_fraction, - extract_minute as cpp_extract_minute, - extract_month as cpp_extract_month, - extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, - extract_second as cpp_extract_second, - extract_weekday as cpp_extract_weekday, + datetime_component, + extract_datetime_component as cpp_extract_datetime_component, 
extract_year as cpp_extract_year, ) +from pylibcudf.libcudf.datetime import \ + datetime_component as DatetimeComponent # no-cython-lint + from .column cimport Column @@ -41,41 +36,29 @@ cpdef Column extract_year( result = move(cpp_extract_year(values.view())) return Column.from_libcudf(move(result)) +cpdef Column extract_datetime_component( + Column values, + datetime_component component +): + """ + Extract a datetime component from a datetime column. -def extract_datetime_component(Column col, str field): + For details, see :cpp:func:`cudf::extract_datetime_component`. - cdef unique_ptr[column] c_result + Parameters + ---------- + values : Column + The column to extract the component from. + component : DatetimeComponent + The datetime component to extract. - with nogil: - if field == "year": - c_result = move(cpp_extract_year(col.view())) - elif field == "month": - c_result = move(cpp_extract_month(col.view())) - elif field == "day": - c_result = move(cpp_extract_day(col.view())) - elif field == "weekday": - c_result = move(cpp_extract_weekday(col.view())) - elif field == "hour": - c_result = move(cpp_extract_hour(col.view())) - elif field == "minute": - c_result = move(cpp_extract_minute(col.view())) - elif field == "second": - c_result = move(cpp_extract_second(col.view())) - elif field == "millisecond": - c_result = move( - cpp_extract_millisecond_fraction(col.view()) - ) - elif field == "microsecond": - c_result = move( - cpp_extract_microsecond_fraction(col.view()) - ) - elif field == "nanosecond": - c_result = move( - cpp_extract_nanosecond_fraction(col.view()) - ) - elif field == "day_of_year": - c_result = move(cpp_day_of_year(col.view())) - else: - raise ValueError(f"Invalid datetime field: '{field}'") + Returns + ------- + Column + Column with the extracted component. 
+ """ + cdef unique_ptr[column] result - return Column.from_libcudf(move(c_result)) + with nogil: + result = move(cpp_extract_datetime_component(values.view(), component)) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 2167616690f..15beaee47d4 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. # ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx labeling.pyx reduce.pyx - replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx +set(cython_sources + aggregation.pyx binaryop.pyx copying.pyx datetime.pyx expressions.pyx labeling.pyx reduce.pyx + replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd index a4465343197..73cdfb96af5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from libc.stdint cimport uint8_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -7,6 +8,18 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: + cpdef enum class datetime_component(uint8_t): + YEAR + MONTH + DAY + WEEKDAY + HOUR + MINUTE + SECOND + MILLISECOND + MICROSECOND + NANOSECOND + cdef unique_ptr[column] extract_year(const column_view& column) except + cdef unique_ptr[column] extract_month(const column_view& column) except + cdef unique_ptr[column] extract_day(const column_view& column) except + @@ -23,6 +36,10 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_nanosecond_fraction( const column_view& column ) except + + cdef unique_ptr[column] extract_datetime_component( + const column_view& column, + datetime_component component + ) except + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": DAY "cudf::datetime::rounding_frequency::DAY" diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pyx b/python/pylibcudf/pylibcudf/libcudf/datetime.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 83a9573baad..e6688cfff81 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( - column_view input_col, + column_view input, string_scalar true_string) except + cdef unique_ptr[column] from_booleans( - column_view input_col, + 
column_view booleans, string_scalar true_string, string_scalar false_string) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index fa8975c4df9..fceddd58df0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( - column_view input_col, + column_view input, data_type timestamp_type, string format) except + cdef unique_ptr[column] from_timestamps( - column_view input_col, + column_view timestamps, string format, - column_view input_strings_names) except + + column_view names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd index ebe10574353..43ffad1d89f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd @@ -10,7 +10,7 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_durations( - const column_view & strings_col, + const column_view & input, data_type duration_type, const string & format) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index 6f820f3c9a4..72ab329f2dd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ 
b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -9,13 +9,13 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_fixed_point( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] from_fixed_point( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] is_fixed_point( - column_view source_strings, - data_type output_type + column_view input, + data_type decimal_type ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index fe571cfced6..801db438e92 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -8,11 +8,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] integers_to_ipv4( - column_view input_col) except + + column_view integers) except + cdef unique_ptr[column] is_ipv4( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index e0a8b776465..0d286c36446 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -11,3 +11,7 @@ cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( column_view input, regex_program prog) except + + + cdef unique_ptr[column] find_re( + column_view input, + regex_program prog) except + diff --git 
a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd index 657fe61eb14..875f8cafd14 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd @@ -12,11 +12,11 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] pad( - column_view source_strings, + column_view input, size_type width, side_type side, string fill_char) except + cdef unique_ptr[column] zfill( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 019ff3f17ba..e92c5dc1d66 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,12 +1,10 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-from libc.stdint cimport int32_t +from libcpp cimport int cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - cpdef enum class side_type(int32_t): - LEFT 'cudf::strings::side_type::LEFT' - RIGHT 'cudf::strings::side_type::RIGHT' - BOTH 'cudf::strings::side_type::BOTH' - -ctypedef int32_t underlying_type_t_side_type + cpdef enum class side_type(int): + LEFT + RIGHT + BOTH diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd index b0ca771762d..dd527a78e7f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd @@ -10,6 +10,6 @@ from pylibcudf.libcudf.strings.side_type cimport side_type cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] strip( - column_view source_strings, - side_type stype, + column_view input, + side_type side, string_scalar to_strip) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd index c0053391328..abc1bd43ad2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd @@ -9,5 +9,5 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] wrap( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt new file mode 100644 index 00000000000..ebe1fda1f12 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources edit_distance.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd new file mode 100644 index 00000000000..82f7c425b1d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport edit_distance + +__all__ = [ + "edit_distance", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py new file mode 100644 index 00000000000..986652a241f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import edit_distance + +__all__ = [ + "edit_distance", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd new file mode 100644 index 00000000000..446b95afabb --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column + + +cpdef Column edit_distance(Column input, Column targets) + +cpdef Column edit_distance_matrix(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx new file mode 100644 index 00000000000..fc98ccbc50c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.edit_distance cimport ( + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix, +) + + +cpdef Column edit_distance(Column input, Column targets): + """ + Returns the edit distance between individual strings in two strings columns + + For details, see :cpp:func:`edit_distance` + + Parameters + ---------- + input : Column + Input strings + targets : Column + Strings to compute edit distance against + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef column_view c_targets = targets.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance(c_strings, c_targets)) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column edit_distance_matrix(Column input): + """ + Returns the edit distance between all strings in the input strings column + + For details, see :cpp:func:`edit_distance_matrix` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance_matrix(c_strings)) + + return Column.from_libcudf(move(c_result)) diff --git 
a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index d92f806efbe..eeb44d19333 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -22,6 +22,7 @@ set(cython_sources find.pyx find_multiple.pyx findall.pyx + padding.pyx regex_flags.pyx regex_program.pyx repeat.pyx @@ -30,6 +31,7 @@ set(cython_sources slice.pyx strip.pyx translate.pyx + wrap.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 788e2c99ab1..187ef113073 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -11,13 +11,16 @@ from . cimport ( find, find_multiple, findall, + padding, regex_flags, regex_program, replace, + side_type, slice, split, strip, translate, + wrap, ) from .side_type cimport side_type @@ -39,4 +42,5 @@ __all__ = [ "split", "side_type", "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index bcaeb073d0b..6033cea0625 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -11,14 +11,17 @@ find, find_multiple, findall, + padding, regex_flags, regex_program, repeat, replace, + side_type, slice, split, strip, translate, + wrap, ) from .side_type import SideType @@ -40,4 +43,5 @@ "split", "SideType", "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..eb0d6ee6999 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx + convert_fixed_point.pyx convert_ipv4.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..431beed8e5d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_ipv4, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..a601b562c2e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations +from . import ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_ipv4, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd new file mode 100644 index 00000000000..312ac3c0ca0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column to_booleans(Column input, Scalar true_string) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx new file mode 100644 index 00000000000..0c10f821ab6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert cimport ( + convert_booleans as cpp_convert_booleans, +) +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column to_booleans(Column input, Scalar true_string): + """ + Returns a new bool column by parsing boolean values from the strings + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_booleans`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + true_string : Scalar + String to expect for true. Non-matching strings are false + + Returns + ------- + Column + New bool column converted from strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): + """ + Returns a new strings column converting the boolean values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_booleans`. 
+ + Parameters + ---------- + booleans : Column + Boolean column to convert. + + true_string : Scalar + String to use for true in the output column. + + false_string : Scalar + String to use for false in the output column. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + cdef const string_scalar* c_false_string = ( + false_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 07c84d263d6..80ec168644b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ) + +cpdef Column is_timestamp( + Column input, + str format, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index fcacb096f87..0ee60812e00 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -15,28 +15,74 @@ from pylibcudf.types import DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ): + """ + Returns a new timestamp column converting a strings column into + timestamps using the provided format pattern. 
+ + For details, see :cpp:func:`cudf::strings::to_timestamps`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + timestamp_type : DataType + The timestamp type used for creating the output column. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New datetime column + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ): + """ + Returns a new strings column converting a timestamp column into + strings using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::from_timestamps`. + + Parameters + ---------- + timestamps : Column + Timestamp values to convert + + format : str + The string specifying output format. + + input_strings_names : Column + The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + + Returns + ------- + Column + New strings column with formatted timestamps. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.from_timestamps( - input.view(), - format, + timestamps.view(), + c_format, input_strings_names.view() ) @@ -44,13 +90,33 @@ cpdef Column from_timestamps( cpdef Column is_timestamp( Column input, - const string& format + str format ): + """ + Verifies the given strings column can be parsed to timestamps + using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::is_timestamp`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New bool column.
+ """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - format + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd index ac11b8959ed..eecdade4ef9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -8,10 +8,10 @@ from pylibcudf.types cimport DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index f3e0b7c9c8e..76c5809c3d5 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -15,27 +15,80 @@ from pylibcudf.types import DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ): + """ + Returns a new duration column converting a strings column into + durations using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::to_durations` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + duration_type : DataType + The duration type used for creating the output column. + + format : str + String specifying the duration format in strings. + + Returns + ------- + Column + New duration column.
+ """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() + with nogil: - c_result = cpp_convert_durations.to_durations( - input.view(), - duration_type.c_obj, - format + c_result = move( + cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + c_format + ) ) return Column.from_libcudf(move(c_result)) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=None ): + """ + Returns a new strings column converting a duration column into + strings using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::from_durations` + + Parameters + ---------- + durations : Column + Duration values to convert. + + format : str + The string specifying output format. + Default format is "%D days %H:%M:%S". + + Returns + ------- + Column + New strings column with formatted durations. + """ cdef unique_ptr[column] c_result + + if format is None: + format = "%D days %H:%M:%S" + cdef string c_format = format.encode() + with nogil: - c_result = cpp_convert_durations.from_durations( - input.view(), - format + c_result = move( + cpp_convert_durations.from_durations( + durations.view(), + c_format + ) ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd new file mode 100644 index 00000000000..049b9b3fffe --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_fixed_point(Column input, DataType output_type) + +cpdef Column from_fixed_point(Column input) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx new file mode 100644 index 00000000000..40dadf6f967 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -0,0 +1,107 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_fixed_point as cpp_fixed_point, +) +from pylibcudf.types cimport DataType, type_id + + +cpdef Column to_fixed_point(Column input, DataType output_type): + """ + Returns a new fixed-point column parsing decimal values from the + provided strings column. + + For details, see :cpp:func:`cudf::strings::to_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of fixed-point column to return including the scale value. + + Returns + ------- + Column + New column of output_type. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_fixed_point(Column input): + """ + Returns a new strings column converting the fixed-point values + into a strings column. + + For details, see :cpp:func:`cudf::strings::from_fixed_point` + + Parameters + ---------- + input : Column + Fixed-point column to convert. + + Returns + ------- + Column + New strings column.
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.from_fixed_point( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to fixed-point. + + For details, see :cpp:func:`cudf::strings::is_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + decimal_type : DataType + Fixed-point type (with scale) used only for checking overflow. + Defaults to Decimal64 + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if decimal_type is None: + decimal_type = DataType(type_id.DECIMAL64) + + with nogil: + c_result = move( + cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd new file mode 100644 index 00000000000..c61f5c0bdca --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column ipv4_to_integers(Column input) + +cpdef Column integers_to_ipv4(Column integers) + +cpdef Column is_ipv4(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx new file mode 100644 index 00000000000..f2a980d4269 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 + + +cpdef Column ipv4_to_integers(Column input): + """ + Converts IPv4 addresses into integers. + + For details, see :cpp:func:`cudf::strings::ipv4_to_integers` + + Parameters + ---------- + input : Column + Strings instance for this operation + + Returns + ------- + Column + New uint32 column converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.ipv4_to_integers( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_ipv4(Column integers): + """ + Converts integers into IPv4 addresses as strings. + + For details, see :cpp:func:`cudf::strings::integers_to_ipv4` + + Parameters + ---------- + integers : Column + Integer (uint32) column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.integers_to_ipv4( + integers.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_ipv4(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from IPv4 format. + + For details, see :cpp:func:`cudf::strings::is_ipv4` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string.
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.is_ipv4( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd index 54afa088141..3c35a9c9aa9 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -4,4 +4,5 @@ from pylibcudf.column cimport Column from pylibcudf.strings.regex_program cimport RegexProgram +cpdef Column find_re(Column input, RegexProgram pattern) cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 3a6b87504b3..5212dc4594d 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -38,3 +38,35 @@ cpdef Column findall(Column input, RegexProgram pattern): ) return Column.from_libcudf(move(c_result)) + + +cpdef Column find_re(Column input, RegexProgram pattern): + """ + Returns character positions where the pattern first matches + the elements in input strings. + + For details, see :cpp:func:`cudf::strings::find_re` + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New column of integers + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_findall.find_re( + input.view(), + pattern.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/strings/padding.pxd new file mode 100644 index 00000000000..a035a5ad187 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.side_type cimport side_type +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char) + +cpdef Column zfill(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx new file mode 100644 index 00000000000..24daaaa3838 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -0,0 +1,75 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport padding as cpp_padding +from pylibcudf.libcudf.strings.side_type cimport side_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char): + """ + Add padding to each string using a provided character. + + For details, see :cpp:func:`cudf::strings::pad`. + + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. + side : SideType + Where to place the padding characters. + fill_char : str + Single UTF-8 character to use for padding + + Returns + ------- + Column + New column with padded strings. + """ + cdef unique_ptr[column] c_result + cdef string c_fill_char = fill_char.encode("utf-8") + + with nogil: + c_result = move( + cpp_padding.pad( + input.view(), + width, + side, + c_fill_char, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column zfill(Column input, size_type width): + """ + Add '0' as padding to the left of each string. + + For details, see :cpp:func:`cudf::strings::zfill`. + + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. 
+ + Returns + ------- + Column + New column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_padding.zfill( + input.view(), + width, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd index 34b7a580380..34b03e9bc27 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -1,3 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index acdc7d6ff1f..cf0c770cc11 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,4 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/strings/wrap.pxd new file mode 100644 index 00000000000..fcc86650acf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx new file mode 100644 index 00000000000..11e31f54eee --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport wrap as cpp_wrap +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width): + """ + Wraps strings onto multiple lines shorter than `width` by + replacing appropriate white space with + new-line characters (ASCII 0x0A). + + For details, see :cpp:func:`cudf::strings::wrap`. + + Parameters + ---------- + input : Column + String column + + width : int + Maximum character width of a line within each string + + Returns + ------- + Column + Column of wrapped strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_wrap.wrap( + input.view(), + width, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index 89c96829e71..75930d59058 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -1,7 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import datetime -import functools import pyarrow as pa import pyarrow.compute as pc @@ -10,19 +9,6 @@ from utils import assert_column_eq -@pytest.fixture -def date_column(has_nulls): - values = [ - datetime.date(1999, 1, 1), - datetime.date(2024, 10, 12), - datetime.date(1, 1, 1), - datetime.date(9999, 1, 1), - ] - if has_nulls: - values[2] = None - return plc.interop.from_arrow(pa.array(values, type=pa.date32())) - - @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) def datetime_column(has_nulls, request): values = [ @@ -40,24 +26,35 @@ def datetime_column(has_nulls, request): ) -@pytest.mark.parametrize( - "component, pc_fun", - [ - ("year", pc.year), - ("month", pc.month), - ("day", pc.day), - ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), - ("hour", pc.hour), - ("minute", pc.minute), - ("second", pc.second), - ("millisecond", pc.millisecond), - ("microsecond", pc.microsecond), - ("nanosecond", pc.nanosecond), +@pytest.fixture( + params=[ + ("year", plc.datetime.DatetimeComponent.YEAR), + ("month", plc.datetime.DatetimeComponent.MONTH), + ("day", plc.datetime.DatetimeComponent.DAY), + ("day_of_week", plc.datetime.DatetimeComponent.WEEKDAY), + ("hour", plc.datetime.DatetimeComponent.HOUR), + ("minute", plc.datetime.DatetimeComponent.MINUTE), + ("second", plc.datetime.DatetimeComponent.SECOND), + ("millisecond", plc.datetime.DatetimeComponent.MILLISECOND), + ("microsecond", plc.datetime.DatetimeComponent.MICROSECOND), + ("nanosecond", plc.datetime.DatetimeComponent.NANOSECOND), ], + ids=lambda x: x[0], ) -def test_extraction(datetime_column, component, pc_fun): +def component(request): + return request.param + + +def test_extract_datetime_component(datetime_column, component): + attr, component = component + kwargs = {} + if attr == "day_of_week": + kwargs = {"count_from_zero": False} got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = 
pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) + + expect = getattr(pc, attr)( + plc.interop.to_arrow(datetime_column), **kwargs + ).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py new file mode 100644 index 00000000000..7d93c471cc4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def edit_distance_data(): + arr1 = ["hallo", "goodbye", "world"] + arr2 = ["hello", "", "world"] + return pa.array(arr1), pa.array(arr2) + + +def test_edit_distance(edit_distance_data): + input_col, targets = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + ) + expected = pa.array([1, 7, 0], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_edit_distance_matrix(edit_distance_data): + input_col, _ = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance_matrix( + plc.interop.from_arrow(input_col) + ) + expected = pa.array( + [[0, 7, 4], [7, 0, 6], [4, 6, 0]], type=pa.list_(pa.int32()) + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index e9e95459d0e..69f7a0fdd33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -1,7 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from datetime import datetime - import pyarrow as pa import pylibcudf as plc import pytest @@ -21,39 +19,16 @@ def timestamp_type(request): return request.param -@pytest.fixture( - scope="module", - params=[ - pa.duration("ns"), - pa.duration("us"), - pa.duration("ms"), - pa.duration("s"), - ], -) -def duration_type(request): - return request.param - - @pytest.fixture(scope="module") def pa_timestamp_col(): return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) -@pytest.fixture(scope="module") -def pa_duration_col(): - return pa.array(["05:20:25"]) - - @pytest.fixture(scope="module") def plc_timestamp_col(pa_timestamp_col): return plc.interop.from_arrow(pa_timestamp_col) -@pytest.fixture(scope="module") -def plc_duration_col(pa_duration_col): - return plc.interop.from_arrow(pa_duration_col) - - @pytest.mark.parametrize("format", ["%Y-%m-%d"]) def test_to_datetime( pa_timestamp_col, plc_timestamp_col, timestamp_type, format @@ -62,24 +37,6 @@ def test_to_datetime( got = plc.strings.convert.convert_datetime.to_timestamps( plc_timestamp_col, plc.interop.from_arrow(timestamp_type), - format.encode(), - ) - assert_column_eq(expect, got) - - -@pytest.mark.parametrize("format", ["%H:%M:%S"]) -def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): - def to_timedelta(duration_str): - date = datetime.strptime(duration_str, format) - return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date - - expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( - duration_type - ) - - got = plc.strings.convert.convert_durations.to_durations( - plc_duration_col, - plc.interop.from_arrow(duration_type), - format.encode(), + format, ) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py new file mode 100644 index 00000000000..117c59ff1b8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py 
@@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_booleans(): + pa_array = pa.array(["true", None, "True"]) + result = plc.strings.convert.convert_booleans.to_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("True")), + ) + expected = pa.array([False, None, True]) + assert_column_eq(result, expected) + + +def test_from_booleans(): + pa_array = pa.array([True, None, False]) + result = plc.strings.convert.convert_booleans.from_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("A")), + plc.interop.from_arrow(pa.scalar("B")), + ) + expected = pa.array(["A", None, "B"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py new file mode 100644 index 00000000000..f3e84286a36 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import datetime + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture +def fmt(): + return "%Y-%m-%dT%H:%M:%S" + + +def test_to_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None]) + result = plc.strings.convert.convert_datetime.to_timestamps( + plc.interop.from_arrow(arr), + plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), + fmt, + ) + expected = pc.strptime(arr, fmt, "s") + assert_column_eq(result, expected) + + +def test_from_timestamp(fmt): + arr = pa.array([datetime.datetime(2020, 1, 1, 1, 1, 1), None]) + result = plc.strings.convert.convert_datetime.from_timestamps( + plc.interop.from_arrow(arr), + fmt, + plc.interop.from_arrow(pa.array([], type=pa.string())), + ) + # pc.strftime will add the extra %f + expected = pa.array(["2020-01-01T01:01:01", None]) + assert_column_eq(result, expected) + + +def test_is_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None, "2020-01-01"]) + result = plc.strings.convert.convert_datetime.is_timestamp( + plc.interop.from_arrow(arr), + fmt, + ) + expected = pa.array([True, None, False]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py new file mode 100644 index 00000000000..6d704309bfd --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from datetime import datetime, timedelta + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +def test_to_duration(pa_duration_col, plc_duration_col, duration_type): + format = "%H:%M:%S" + + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format, + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", [None, "%D days %H:%M:%S"]) +def test_from_durations(format): + pa_array = pa.array( + [timedelta(days=1, hours=1, minutes=1, seconds=1), None] + ) + result = plc.strings.convert.convert_durations.from_durations( + plc.interop.from_arrow(pa_array), format + ) + expected = pa.array(["1 days 01:01:01", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py new file mode 100644 index 00000000000..b1c4d729604 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import decimal + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_fixed_point(): + typ = pa.decimal128(38, 2) + arr = pa.array(["123", "1.23", None]) + result = plc.strings.convert.convert_fixed_point.to_fixed_point( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_fixed_point(): + arr = pa.array([decimal.Decimal("1.1"), None]) + result = plc.strings.convert.convert_fixed_point.from_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["1.1", None]) + assert_column_eq(result, expected) + + +def test_is_fixed_point(): + arr = pa.array(["123", "1.23", "1.2.3", "", None]) + result = plc.strings.convert.convert_fixed_point.is_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py new file mode 100644 index 00000000000..4dc3e512624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_ipv4_to_integers(): + arr = pa.array(["123.45.67.890", None]) + result = plc.strings.convert.convert_ipv4.ipv4_to_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array([2066564730, None], type=pa.uint32()) + assert_column_eq(result, expected) + + +def test_integers_to_ipv4(): + arr = pa.array([1, 0, None], type=pa.uint32()) + result = plc.strings.convert.convert_ipv4.integers_to_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["0.0.0.1", "0.0.0.0", None]) + assert_column_eq(result, expected) + + +def test_is_ipv4(): + arr = pa.array(["0.0.0.1", "1.2.34", "A", None]) + result = plc.strings.convert.convert_ipv4.is_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py index 994552fa276..debfad92d00 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -21,3 +21,20 @@ def test_findall(): type=pa_result.type, ) assert_column_eq(result, expected) + + +def test_find_re(): + arr = pa.array(["bunny", "rabbit", "hare", "dog"]) + pattern = "[eb]" + result = plc.strings.findall.find_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + pa_result = plc.interop.to_arrow(result) + expected = pa.array( + [0, 2, 3, -1], + type=pa_result.type, + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py new file mode 100644 index 00000000000..2ba775d17ae --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc + + +def test_pad(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.pad( + plc.interop.from_arrow(arr), + 2, + plc.strings.side_type.SideType.LEFT, + "!", + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="!")) + assert result.equals(expected) + + +def test_zfill(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.zfill(plc.interop.from_arrow(arr), 2) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="0")) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py new file mode 100644 index 00000000000..a1c820cd586 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import textwrap + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_wrap(): + width = 12 + pa_array = pa.array( + [ + "the quick brown fox jumped over the lazy brown dog", + "hello, world", + None, + ] + ) + result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), width) + expected = pa.array( + [ + textwrap.fill(val, width) if isinstance(val, str) else val + for val in pa_array.to_pylist() + ] + ) + assert_column_eq(expected, result)