diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 55399d0371a..f5bcdc62604 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -24,14 +24,17 @@ rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -# Download the pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep +# Download libcudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep -rapids-logger "Install pylibcudf" -python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl +rapids-logger "Install libcudf, pylibcudf and cudf_polars" +python -m pip install \ + -v \ + "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ + "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" -rapids-logger "Install cudf_polars" -python -m pip install $(echo ./dist/cudf_polars*.whl) TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') rapids-logger "Clone polars to ${TAG}" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 136f43ee706..f7a5dd2f2fb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,6 +52,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) +option(CUDF_EXPORT_NVCOMP 
"Export NVCOMP as a dependency" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) option( diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 110b4557840..b8a53cd8bd9 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 65a44532cf1..f44f26e4d2c 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,14 +15,16 @@ */ #include -#include -#include #include #include +#include + #include +#include + #include #include #include @@ -35,13 +37,10 @@ enum class TreeType { }; template -class AST : public cudf::benchmark {}; - -template -static void BM_ast_transform(benchmark::State& state) +static void BM_ast_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size = static_cast(state.get_int64("table_size")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state) auto const& expression_tree_root = expressions.back(); - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::compute_column(table, expression_tree_root); - } - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); -} + state.add_global_memory_reads(table_size * (tree_levels + 1)); -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ - (::benchmark::State & st) \ + static void name(::nvbench::state& st) \ { \ - BM_ast_transform(st); \ + ::BM_ast_transform(st); \ 
} \ - BENCHMARK_REGISTER_F(AST, name) \ - ->Apply(CustomRanges) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("tree_levels", {1, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index fa98d9e601a..7d267a88764 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,15 +15,14 @@ */ #include -#include -#include #include #include #include +#include + #include -#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -33,13 +32,10 @@ enum class TreeType { }; template -class BINARYOP : public cudf::benchmark {}; - -template -static void BM_binaryop_transform(benchmark::State& state) +static void BM_binaryop_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size{static_cast(state.get_int64("table_size"))}; + auto const tree_levels{static_cast(state.get_int64("tree_levels"))}; // Create table data auto const n_cols = reuse_columns ? 
1 : tree_levels + 1; @@ -47,9 +43,10 @@ static void BM_binaryop_transform(benchmark::State& state) cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); cudf::table_view table{*source_table}; - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + // Use the number of bytes read from global memory + state.add_global_memory_reads(table_size * (tree_levels + 1)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) auto const op = cudf::binary_operator::ADD; auto const result_data_type = cudf::data_type(cudf::type_to_id()); @@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state) result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } - } - - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); + }); } #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \ - (::benchmark::State & st) { BM_binaryop_transform(st); } + \ + static void name(::nvbench::state& st) \ + { \ + BM_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique, int32_t, @@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false); - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 2, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : 
operation_counts) { - b->Args({row_count, operation_count}); - } - } -} - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 7086a61c7c5..bc0ff69bce9 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -15,20 +15,18 @@ */ #include -#include -#include #include -class COMPILED_BINARYOP : public cudf::benchmark {}; +#include template -void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) { - auto const column_size{static_cast(state.range(0))}; + auto const table_size = static_cast(state.get_int64("table_size")); auto const source_table = create_random_table( - {cudf::type_to_id(), cudf::type_to_id()}, row_count{column_size}); + {cudf::type_to_id(), cudf::type_to_id()}, row_count{table_size}); auto lhs = cudf::column_view(source_table->get_column(0)); auto rhs = cudf::column_view(source_table->get_column(1)); @@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) // Call once for hot cache. 
cudf::binary_operation(lhs, rhs, binop, output_dtype); - for (auto _ : state) { - cuda_event_timer timer(state, true); - cudf::binary_operation(lhs, rhs, binop, output_dtype); - } - // use number of bytes read and written to global memory - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); } +#define BM_STRINGIFY(a) #a + // TODO tparam boolean for null. -#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ - BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ - (::benchmark::State & st) \ - { \ - BM_compiled_binaryop(st, cudf::binary_operator::bop); \ - } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_compiled_binaryop(st, ::cudf::binary_operator::bop); \ + } \ + NVBENCH_BENCH(name) \ + .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \ + .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) #define build_name(a, b, c, d) a##_##b##_##c##_##d diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 1b6a1730161..33b1b45fb44 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -16,7 +16,11 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp(USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) + set(export_args) + 
if(CUDF_EXPORT_NVCOMP) + set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endif() + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp index 8c1c3c28df8..e7643eb44bd 100644 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ b/cpp/include/cudf/detail/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ #include // Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::logger(), __VA_ARGS__) +#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_CRITICAL(...) 
SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index b12fbe39a57..dc14802adc1 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -86,14 +86,28 @@ class datasource { /** * @brief Creates a source from a file path. * + * @note Parameters `offset`, `max_size_estimate` and `min_size_estimate` are hints to the + * `datasource` implementation about the expected range of the data that will be read. The + * implementation may use these hints to optimize the read operation. These parameters are usually + * based on the byte range option. In this case, `min_size_estimate` should be no greater than the + * byte range to avoid potential issues when reading adjacent ranges. `max_size_estimate` can + * include padding after the byte range, to include additional data that may be needed for + * processing. + * + * @throws cudf::logic_error if the minimum size estimate is greater than the maximum size estimate + * + * @param[in] filepath Path to the file to use - * @param[in] offset Bytes from the start of the file (the default is zero) - * @param[in] size Bytes from the offset; use zero for entire file (the default is zero) + * @param[in] offset Starting byte offset from which data will be read (the default is zero) + * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is + * zero, which means the whole file after `offset`) + * @param[in] min_size_estimate Lower estimate of the data range that will be read (the default is + * zero, which means the whole file after `offset`) * @return Constructed datasource object */ static std::unique_ptr create(std::string const& filepath, - size_t offset = 0, - size_t size = 0); + size_t offset = 0, + size_t max_size_estimate = 0, + size_t min_size_estimate = 0); /** * @brief Creates a source from a host memory buffer. 
diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index 45d5d1b12e1..982554a23f5 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -22,6 +22,10 @@ namespace CUDF_EXPORT cudf { +namespace detail { +spdlog::logger& logger(); +} + /** * @brief Returns the global logger. * @@ -43,6 +47,8 @@ namespace CUDF_EXPORT cudf { * * @return spdlog::logger& The logger. */ -spdlog::logger& logger(); +[[deprecated( + "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& +logger(); } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index 723ba310a1e..dca590baebf 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext { * @param targets Strings to compute edit distance against `input` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New lists column of edit distance values */ std::unique_ptr edit_distance( cudf::strings_column_view const& input, diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index de8eea9e99b..5a060902eb2 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -122,14 +122,16 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( namespace { std::vector> make_datasources(source_info const& info, - size_t range_offset = 0, - size_t range_size = 0) + size_t offset = 0, + size_t max_size_estimate = 0, + size_t min_size_estimate = 0) { switch (info.type()) { case io_type::FILEPATH: { auto sources = std::vector>(); for (auto const& filepath : info.filepaths()) { - sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, 
range_size)); + sources.emplace_back( + cudf::io::datasource::create(filepath, offset, max_size_estimate, min_size_estimate)); } return sources; } @@ -211,7 +213,8 @@ table_with_metadata read_json(json_reader_options options, auto datasources = make_datasources(options.get_source(), options.get_byte_range_offset(), - options.get_byte_range_size_with_padding()); + options.get_byte_range_size_with_padding(), + options.get_byte_range_size()); return json::detail::read_json(datasources, options, stream, mr); } @@ -238,7 +241,8 @@ table_with_metadata read_csv(csv_reader_options options, auto datasources = make_datasources(options.get_source(), options.get_byte_range_offset(), - options.get_byte_range_size_with_padding()); + options.get_byte_range_size_with_padding(), + options.get_byte_range_size()); CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e4313eba454..0be976b6144 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -32,6 +32,7 @@ #include #include +#include namespace cudf { namespace io { @@ -54,6 +55,30 @@ class file_source : public datasource { } } + std::unique_ptr host_read(size_t offset, size_t size) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + ssize_t const read_size = std::min(size, _file.size() - offset); + + std::vector v(read_size); + CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); + return buffer::create(std::move(v)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + auto const read_size = std::min(size, _file.size() - offset); + + CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), + "read failed"); + return read_size; + } + ~file_source() override = default; 
[[nodiscard]] bool supports_device_read() const override @@ -138,40 +163,63 @@ class file_source : public datasource { */ class memory_mapped_source : public file_source { public: - explicit memory_mapped_source(char const* filepath, size_t offset, size_t size) + explicit memory_mapped_source(char const* filepath, + size_t offset, + size_t max_size_estimate, + size_t min_size_estimate) : file_source(filepath) { if (_file.size() != 0) { - map(_file.desc(), offset, size); - register_mmap_buffer(); + // Memory mapping is not exclusive, so we can include the whole region we expect to read + map(_file.desc(), offset, max_size_estimate); + // Buffer registration is exclusive (can't overlap with other registered buffers) so we + // register the lower estimate; this avoids issues when reading adjacent ranges from the same + // file from multiple threads + register_mmap_buffer(offset, min_size_estimate); } } ~memory_mapped_source() override { if (_map_addr != nullptr) { - munmap(_map_addr, _map_size); + unmap(); unregister_mmap_buffer(); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); + + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size); + } - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is only partially within the registered region, copy to a new + // host buffer to make the data safe to copy to the device + if (_reg_addr != nullptr and + (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) { + auto const src = static_cast(_map_addr) + (offset - _map_offset); + + return 
std::make_unique>>( + std::vector(src, src + read_size)); + } return std::make_unique( - static_cast(_map_addr) + (offset - _map_offset), read_size); + static_cast(_map_addr) + offset - _map_offset, read_size); } size_t host_read(size_t offset, size_t size, uint8_t* dst) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size, dst); + } auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); @@ -184,16 +232,18 @@ class memory_mapped_source : public file_source { * * Fixes nvbugs/4215160 */ - void register_mmap_buffer() + void register_mmap_buffer(size_t offset, size_t size) { - if (_map_addr == nullptr or _map_size == 0 or not pageableMemoryAccessUsesHostPageTables()) { - return; - } + if (_map_addr == nullptr or not pageableMemoryAccessUsesHostPageTables()) { return; } - auto const result = cudaHostRegister(_map_addr, _map_size, cudaHostRegisterDefault); - if (result == cudaSuccess) { - _is_map_registered = true; - } else { + // Registered region must be within the mapped region + _reg_offset = std::max(offset, _map_offset); + _reg_size = std::min(size != 0 ? 
size : _map_size, (_map_offset + _map_size) - _reg_offset); + + _reg_addr = static_cast(_map_addr) - _map_offset + _reg_offset; + auto const result = cudaHostRegister(_reg_addr, _reg_size, cudaHostRegisterReadOnly); + if (result != cudaSuccess) { + _reg_addr = nullptr; CUDF_LOG_WARN("cudaHostRegister failed with {} ({})", static_cast(result), cudaGetErrorString(result)); @@ -205,10 +255,12 @@ class memory_mapped_source : public file_source { */ void unregister_mmap_buffer() { - if (not _is_map_registered) { return; } + if (_reg_addr == nullptr) { return; } - auto const result = cudaHostUnregister(_map_addr); - if (result != cudaSuccess) { + auto const result = cudaHostUnregister(_reg_addr); + if (result == cudaSuccess) { + _reg_addr = nullptr; + } else { CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})", static_cast(result), cudaGetErrorString(result)); @@ -226,52 +278,30 @@ class memory_mapped_source : public file_source { // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); + if (_map_size == 0) { return; } // Check if accessing a region within already mapped area _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } - private: - size_t _map_size = 0; - size_t _map_offset = 0; - void* _map_addr = nullptr; - bool _is_map_registered = false; -}; - -/** - * @brief Implementation class for reading from a file using `read` calls - * - * Potentially faster than `memory_mapped_source` when only a small portion of the file is read - * through the host. 
- */ -class direct_read_source : public file_source { - public: - explicit direct_read_source(char const* filepath) : file_source(filepath) {} - - std::unique_ptr host_read(size_t offset, size_t size) override + void unmap() { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - ssize_t const read_size = std::min(size, _file.size() - offset); - - std::vector v(read_size); - CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); - return buffer::create(std::move(v)); + if (_map_addr != nullptr) { + auto const result = munmap(_map_addr, _map_size); + if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); } + _map_addr = nullptr; + } } - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - auto const read_size = std::min(size, _file.size() - offset); + private: + size_t _map_offset = 0; + size_t _map_size = 0; + void* _map_addr = nullptr; - CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), - "read failed"); - return read_size; - } + size_t _reg_offset = 0; + size_t _reg_size = 0; + void* _reg_addr = nullptr; }; /** @@ -431,16 +461,21 @@ class user_datasource_wrapper : public datasource { std::unique_ptr datasource::create(std::string const& filepath, size_t offset, - size_t size) + size_t max_size_estimate, + size_t min_size_estimate) { + CUDF_EXPECTS(max_size_estimate == 0 or min_size_estimate <= max_size_estimate, + "Invalid min/max size estimates for datasource creation"); + #ifdef CUFILE_FOUND if (cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads - return std::make_unique(filepath.c_str()); + return std::make_unique(filepath.c_str()); } #endif // Use our own memory mapping implementation for direct file reads - return std::make_unique(filepath.c_str(), offset, size); + return std::make_unique( + filepath.c_str(), offset, 
max_size_estimate, min_size_estimate); } std::unique_ptr datasource::create(host_buffer const& buffer) diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp index d54f5677c4c..e52fffbd8c6 100644 --- a/cpp/src/utilities/logger.cpp +++ b/cpp/src/utilities/logger.cpp @@ -74,8 +74,10 @@ struct logger_wrapper { } // namespace -spdlog::logger& cudf::logger() +spdlog::logger& cudf::detail::logger() { static logger_wrapper wrapped{}; return wrapped.logger_; } + +spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index dc14824d834..0028dd946e3 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -2516,4 +2516,39 @@ TEST_F(CsvReaderTest, UTF8BOM) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expected); } +void expect_buffers_equal(cudf::io::datasource::buffer* lhs, cudf::io::datasource::buffer* rhs) +{ + ASSERT_EQ(lhs->size(), rhs->size()); + EXPECT_EQ(0, std::memcmp(lhs->data(), rhs->data(), lhs->size())); +} + +TEST_F(CsvReaderTest, OutOfMapBoundsReads) +{ + // write a lot of data into a file + auto filepath = temp_env->get_temp_dir() + "OutOfMapBoundsReads.csv"; + auto const num_rows = 1 << 20; + auto const row = std::string{"0,1,2,3,4,5,6,7,8,9\n"}; + auto const file_size = num_rows * row.size(); + { + std::ofstream outfile(filepath, std::ofstream::out); + for (size_t i = 0; i < num_rows; ++i) { + outfile << row; + } + } + + // Only memory map the middle of the file + auto source = cudf::io::datasource::create(filepath, file_size / 2, file_size / 4); + auto full_source = cudf::io::datasource::create(filepath); + auto const all_data = source->host_read(0, file_size); + auto ref_data = full_source->host_read(0, file_size); + expect_buffers_equal(ref_data.get(), all_data.get()); + + auto const start_data = source->host_read(file_size / 2, file_size / 2); + expect_buffers_equal(full_source->host_read(file_size / 2, file_size / 2).get(), + 
start_data.get()); + + auto const end_data = source->host_read(0, file_size / 2 + 512); + expect_buffers_equal(full_source->host_read(0, file_size / 2 + 512).get(), end_data.get()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index d052e20eedb..cfab570833b 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -28,16 +28,17 @@ class LoggerTest : public cudf::test::BaseFixture { std::vector prev_sinks; public: - LoggerTest() : prev_level{cudf::logger().level()}, prev_sinks{cudf::logger().sinks()} + LoggerTest() + : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} { - cudf::logger().sinks() = {std::make_shared(oss)}; - cudf::logger().set_formatter( + cudf::detail::logger().sinks() = {std::make_shared(oss)}; + cudf::detail::logger().set_formatter( std::unique_ptr(new spdlog::pattern_formatter("%v"))); } ~LoggerTest() override { - cudf::logger().set_level(prev_level); - cudf::logger().sinks() = prev_sinks; + cudf::detail::logger().set_level(prev_level); + cudf::detail::logger().sinks() = prev_sinks; } void clear_sink() { oss.str(""); } @@ -46,32 +47,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::logger().critical("crit msg"); + cudf::detail::logger().critical("crit msg"); ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); - cudf::logger().info("info"); - cudf::logger().warn("warn"); - cudf::logger().error("error"); - cudf::logger().critical("critical"); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); + cudf::detail::logger().error("error"); + cudf::detail::logger().critical("critical"); ASSERT_EQ(this->sink_content(), 
"warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::logger().set_level(spdlog::level::warn); - cudf::logger().info("info"); - cudf::logger().warn("warn"); + cudf::detail::logger().set_level(spdlog::level::warn); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::logger().set_level(spdlog::level::debug); - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); + cudf::detail::logger().set_level(spdlog::level::debug); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); } diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e21536e2e97..052479d6720 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -49,3 +49,4 @@ This page provides API documentation for pylibcudf. io/index.rst strings/index.rst + nvtext/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst new file mode 100644 index 00000000000..abb45e426a8 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst @@ -0,0 +1,6 @@ +============= +edit_distance +============= + +.. automodule:: pylibcudf.nvtext.edit_distance + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst new file mode 100644 index 00000000000..b5cd5ee42c3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -0,0 +1,7 @@ +nvtext +====== + +.. 
toctree:: + :maxdepth: 1 + + edit_distance diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx index e3c2273345a..3dd99c42d76 100644 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx @@ -2,37 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.edit_distance cimport ( - edit_distance as cpp_edit_distance, - edit_distance_matrix as cpp_edit_distance_matrix, -) +from pylibcudf cimport nvtext from cudf._lib.column cimport Column @acquire_spill_lock() def edit_distance(Column strings, Column targets): - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def edit_distance_matrix(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance_matrix( + strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 59bf40443f4..fbe37246656 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -3,9 +3,6 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar - -from 
cudf._lib.scalar cimport DeviceScalar - from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from libcpp.memory cimport unique_ptr @@ -14,14 +11,6 @@ from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, to_floats as cpp_to_floats, @@ -406,77 +395,21 @@ def stoul(Column input_col): return string_to_integer(input_col, cudf.dtype("uint64")) -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) - - def to_booleans(Column input_col): - - return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - ---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column 
with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_booleans.to_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + ) + return Column.from_pylibcudf(plc_column) def from_booleans(Column input_col): - return _from_booleans(input_col) + plc_column = plc.strings.convert.convert_booleans.from_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + as_device_scalar("False").c_value, + ) + return Column.from_pylibcudf(plc_column) def int2timestamp( @@ -499,11 +432,10 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef string c_timestamp_format = format.encode("UTF-8") return Column.from_pylibcudf( plc.strings.convert.convert_datetime.from_timestamps( input_col.to_pylibcudf(mode="read"), - c_timestamp_format, + format, names.to_pylibcudf(mode="read") ) ) @@ -524,12 +456,11 @@ def timestamp2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_datetime.to_timestamps( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -551,16 +482,11 @@ def istimestamp(Column input_col, str format): """ if input_col.size == 0: return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - 
cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_datetime.is_timestamp( + input_col.to_pylibcudf(mode="read"), + format + ) + return Column.from_pylibcudf(plc_column) def timedelta2int(Column input_col, dtype, format): diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index a8df8c9a92c..96dcd021c3b 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -1,22 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -import cudf - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( - from_fixed_point as cpp_from_fixed_point, - is_fixed_point as cpp_is_fixed_point, - to_fixed_point as cpp_to_fixed_point, -) -from pylibcudf.libcudf.types cimport data_type, type_id - from cudf._lib.column cimport Column +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import pylibcudf as plc @acquire_spill_lock() @@ -32,14 +21,10 @@ def from_decimal(Column input_col): ------- A column of strings representing the input decimal values. 
""" - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_fixed_point( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,25 +42,11 @@ def to_decimal(Column input_col, object out_type): ------- A column of decimals parsed from the string values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef int scale = out_type.scale - cdef data_type c_out_type - if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(type_id.DECIMAL32, -scale) - elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(type_id.DECIMAL64, -scale) - elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(type_id.DECIMAL128, -scale) - else: - raise TypeError("should be a decimal dtype") - with nogil: - c_result = move( - cpp_to_fixed_point( - input_column_view, - c_out_type)) - - result = Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type), + ) + result = Column.from_pylibcudf(plc_column) result.dtype.precision = out_type.precision return result @@ -98,14 +69,8 @@ def is_fixed_point(Column input_col, object dtype): ------- A Column of booleans indicating valid decimal conversion. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = input_col.view() - cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) - with nogil: - c_result = move(cpp_is_fixed_point( - source_view, - c_dtype - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(dtype), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 6e5abb2b82b..3d132c92d54 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -681,7 +681,7 @@ def _tile(A, reps): nval = len(value_vars) dtype = min_unsigned_type(nval) - if not var_name: + if var_name is None: var_name = "variable" if not value_vars: diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 4235affd4d1..3adbe1d2a74 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -119,6 +119,15 @@ def test_melt_str_scalar_id_var(): assert_eq(result, expected) +def test_melt_falsy_var_name(): + df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) + result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") + expected = pd.melt( + df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" + ) + assert_eq(result, expected) + + @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c401e5a2f17..54476b7fedc 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -914,7 +914,7 @@ def do_evaluate( col = self.children[0].evaluate(df, context=context, mapping=mapping) is_timestamps = 
plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format.encode() + col.obj, format ) if strict: @@ -937,7 +937,7 @@ def do_evaluate( ) return Column( plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format.encode() + res.columns()[0], self.dtype, format ) ) elif self.name == pl_expr.StringFunction.Replace: diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 2b208e2e021..5f9a04d3cee 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -41,6 +41,9 @@ set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(CUDF_EXPORT_NVCOMP OFF) +endif() set(CUDA_STATIC_RUNTIME ON) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a7cb66d7b16..1d72eacac12 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -66,3 +66,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) add_subdirectory(strings) add_subdirectory(io) +add_subdirectory(nvtext) diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index a384edd456d..b98b37fe0fd 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . 
cimport ( lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -78,4 +79,5 @@ __all__ = [ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 2a5365e8fad..304f27be340 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -28,6 +28,7 @@ lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -92,4 +93,5 @@ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 83a9573baad..e6688cfff81 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( - column_view input_col, + column_view input, string_scalar true_string) except + cdef unique_ptr[column] from_booleans( - column_view input_col, + column_view booleans, string_scalar true_string, string_scalar false_string) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index fa8975c4df9..fceddd58df0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( - column_view input_col, + column_view input, data_type timestamp_type, string format) except + cdef unique_ptr[column] 
from_timestamps( - column_view input_col, + column_view timestamps, string format, - column_view input_strings_names) except + + column_view names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index 6f820f3c9a4..72ab329f2dd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -9,13 +9,13 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_fixed_point( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] from_fixed_point( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] is_fixed_point( - column_view source_strings, - data_type output_type + column_view input, + data_type decimal_type ) except + diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt new file mode 100644 index 00000000000..ebe1fda1f12 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources edit_distance.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd new file mode 100644 index 00000000000..82f7c425b1d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport edit_distance + +__all__ = [ + "edit_distance", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py new file mode 100644 index 00000000000..986652a241f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import edit_distance + +__all__ = [ + "edit_distance", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd new file mode 100644 index 00000000000..446b95afabb --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column edit_distance(Column input, Column targets) + +cpdef Column edit_distance_matrix(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx new file mode 100644 index 00000000000..fc98ccbc50c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.edit_distance cimport ( + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix, +) + + +cpdef Column edit_distance(Column input, Column targets): + """ + Returns the edit distance between individual strings in two strings columns + + For details, see :cpp:func:`edit_distance` + + Parameters + ---------- + input : Column + Input strings + targets : Column + Strings to compute edit distance against + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef column_view c_targets = targets.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance(c_strings, c_targets)) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column edit_distance_matrix(Column input): + """ + Returns the edit distance between all strings in the input strings column + + For details, see :cpp:func:`edit_distance_matrix` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance_matrix(c_strings)) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index cfea23e302a..727f3051cd6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx convert_integers.pyx) +set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx + convert_fixed_point.pyx convert_integers.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 791980aab34..a4a8e88e946 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations, convert_integers +from . cimport ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_integers, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index fbed5a4e1f2..af51b7cc6cb 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations, convert_integers +from . import ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_integers, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd new file mode 100644 index 00000000000..312ac3c0ca0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column to_booleans(Column input, Scalar true_string) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx new file mode 100644 index 00000000000..0c10f821ab6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert cimport ( + convert_booleans as cpp_convert_booleans, +) +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column to_booleans(Column input, Scalar true_string): + """ + Returns a new bool column by parsing boolean values from the strings + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_booleans`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + true_string : Scalar + String to expect for true. Non-matching strings are false + + Returns + ------- + Column + New bool column converted from strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): + """ + Returns a new strings column converting the boolean values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_booleans`. 
+ + Parameters + ---------- + booleans : Column + Boolean column to convert. + + true_string : Scalar + String to use for true in the output column. + + false_string : Scalar + String to use for false in the output column. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + cdef const string_scalar* c_false_string = ( + false_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 07c84d263d6..80ec168644b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ) + +cpdef Column is_timestamp( + Column input, + str format, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index fcacb096f87..0ee60812e00 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -15,28 +15,74 @@ from pylibcudf.types import DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ): + """ + Returns a new timestamp column converting a strings column into + timestamps using the provided format pattern. 
+ + For details, see :cpp:func:`cudf::strings::to_timestamps`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + timestamp_type : DataType + The timestamp type used for creating the output column. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New datetime column + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ): + """ + Returns a new strings column converting a timestamp column into + strings using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::from_timestamps`. + + Parameters + ---------- + timestamps : Column + Timestamp values to convert + + format : str + The string specifying output format. + + input_strings_names : Column + The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + + Returns + ------- + Column + New strings column with formatted timestamps. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.from_timestamps( - input.view(), - format, + timestamps.view(), + c_format, input_strings_names.view() ) @@ -44,13 +90,33 @@ cpdef Column from_timestamps( cpdef Column is_timestamp( Column input, - const string& format + str format ): + """ + Verifies the given strings column can be parsed to timestamps + using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::is_timestamp`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New bool column.
+ """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - format + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd new file mode 100644 index 00000000000..049b9b3fffe --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_fixed_point(Column input, DataType output_type) + +cpdef Column from_fixed_point(Column input) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx new file mode 100644 index 00000000000..40dadf6f967 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -0,0 +1,107 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_fixed_point as cpp_fixed_point, +) +from pylibcudf.types cimport DataType, type_id + + +cpdef Column to_fixed_point(Column input, DataType output_type): + """ + Returns a new fixed-point column parsing decimal values from the + provided strings column. + + For details, see :cpp:func:`cudf::strings::to_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of fixed-point column to return including the scale value. + + Returns + ------- + Column + New column of output_type.
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_fixed_point(Column input): + """ + Returns a new strings column converting the fixed-point values + into a strings column. + + For details, see :cpp:func:`cudf::strings::from_fixed_point` + + Parameters + ---------- + input : Column + Fixed-point column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.from_fixed_point( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to fixed-point. + + For details, see :cpp:func:`cudf::strings::is_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + decimal_type : DataType + Fixed-point type (with scale) used only for checking overflow. + Defaults to Decimal64 + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if decimal_type is None: + decimal_type = DataType(type_id.DECIMAL64) + + with nogil: + c_result = move( + cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py new file mode 100644 index 00000000000..7d93c471cc4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def edit_distance_data(): + arr1 = ["hallo", "goodbye", "world"] + arr2 = ["hello", "", "world"] + return pa.array(arr1), pa.array(arr2) + + +def test_edit_distance(edit_distance_data): + input_col, targets = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + ) + expected = pa.array([1, 7, 0], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_edit_distance_matrix(edit_distance_data): + input_col, _ = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance_matrix( + plc.interop.from_arrow(input_col) + ) + expected = pa.array( + [[0, 7, 4], [7, 0, 6], [4, 6, 0]], type=pa.list_(pa.int32()) + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index e9e95459d0e..22bb4971cb1 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -62,7 +62,7 @@ def test_to_datetime( got = plc.strings.convert.convert_datetime.to_timestamps( plc_timestamp_col, plc.interop.from_arrow(timestamp_type), - format.encode(), + format, ) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py new file mode 100644 index 00000000000..117c59ff1b8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_to_booleans():
+    """Only strings equal to the given true-string convert to True."""
+    plc_strings = plc.interop.from_arrow(pa.array(["true", None, "True"]))
+    true_str = plc.interop.from_arrow(pa.scalar("True"))
+    convert = plc.strings.convert.convert_booleans
+    got = convert.to_booleans(plc_strings, true_str)
+    expect = pa.array([False, None, True])
+    assert_column_eq(got, expect)
+
+
+def test_from_booleans():
+    """True and False are rendered with the supplied replacement strings."""
+    plc_bools = plc.interop.from_arrow(pa.array([True, None, False]))
+    true_str = plc.interop.from_arrow(pa.scalar("A"))
+    false_str = plc.interop.from_arrow(pa.scalar("B"))
+    convert = plc.strings.convert.convert_booleans
+    got = convert.from_booleans(plc_bools, true_str, false_str)
+    expect = pa.array(["A", None, "B"])
+    assert_column_eq(got, expect)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
new file mode 100644
index 00000000000..f3e84286a36
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import datetime
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture
+def fmt():
+    return "%Y-%m-%dT%H:%M:%S"  # date and time joined by a literal "T"
+
+
+def test_to_timestamp(fmt):
+    """Parsed timestamps must match pyarrow's strptime at second precision."""
+    strings = pa.array(["2020-01-01T01:01:01", None])
+    got = plc.strings.convert.convert_datetime.to_timestamps(
+        plc.interop.from_arrow(strings),
+        plc.DataType(plc.TypeId.TIMESTAMP_SECONDS),
+        fmt,
+    )
+    assert_column_eq(got, pc.strptime(strings, fmt, "s"))
+
+
+def test_from_timestamp(fmt):
+    """Format timestamps as strings using ``fmt``; a null stays null."""
+    stamps = pa.array([datetime.datetime(2020, 1, 1, 1, 1, 1), None])
+    empty_strings = plc.interop.from_arrow(pa.array([], type=pa.string()))
+    got = plc.strings.convert.convert_datetime.from_timestamps(
+        plc.interop.from_arrow(stamps), fmt, empty_strings
+    )
+    # expected is spelled out by hand: pc.strftime will add the extra %f
+    expect = pa.array(["2020-01-01T01:01:01", None])
+    assert_column_eq(got, expect)
+
+
+def test_is_timestamp(fmt):
+    """A string lacking the time part is reported as not matching ``fmt``."""
+    strings = pa.array(["2020-01-01T01:01:01", None, "2020-01-01"])
+    got = plc.strings.convert.convert_datetime.is_timestamp(
+        plc.interop.from_arrow(strings), fmt
+    )
+    expect = pa.array([True, None, False])
+    assert_column_eq(got, expect)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
new file mode 100644
index 00000000000..b1c4d729604
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import decimal
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_to_fixed_point():
+    """Decimal strings parse to the same values as a pyarrow cast."""
+    decimal_type = pa.decimal128(38, 2)
+    strings = pa.array(["123", "1.23", None])
+    got = plc.strings.convert.convert_fixed_point.to_fixed_point(
+        plc.interop.from_arrow(strings), plc.interop.from_arrow(decimal_type)
+    )
+    assert_column_eq(got, strings.cast(decimal_type))
+
+
+def test_from_fixed_point():
+    """Decimals are rendered as strings; a null stays null."""
+    decimals = pa.array([decimal.Decimal("1.1"), None])
+    got = plc.strings.convert.convert_fixed_point.from_fixed_point(
+        plc.interop.from_arrow(decimals)
+    )
+    assert_column_eq(got, pa.array(["1.1", None]))
+
+
+def test_is_fixed_point():
+    """Malformed ("1.2.3") and empty strings are not fixed-point values."""
+    strings = pa.array(["123", "1.23", "1.2.3", "", None])
+    got = plc.strings.convert.convert_fixed_point.is_fixed_point(
+        plc.interop.from_arrow(strings)
+    )
+    assert_column_eq(got, pa.array([True, True, False, False, None]))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
index 85abd3a2bae..a1c820cd586 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
@@ -7,6 +7,7 @@
 
 
 def test_wrap():
+    width = 12
     pa_array = pa.array(
         [
             "the quick brown fox jumped over the lazy brown dog",
@@ -14,10 +15,10 @@ def test_wrap():
             None,
         ]
     )
-    result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), 12)
+    result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), width)
     expected = pa.array(
         [
-            textwrap.fill(val, 12) if isinstance(val, str) else val
+            textwrap.fill(val, width) if isinstance(val, str) else val
             for val in pa_array.to_pylist()
         ]
     )