diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index f8ddbaba0f3..30e3ffc9a43 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -26,6 +26,8 @@ main() { LIBS=${LIBS#[} LIBS=${LIBS%]} + ANY_FAILURES=0 + for lib in ${LIBS//,/ }; do lib=$(echo "$lib" | tr -d '""') echo "Running tests for library $lib" @@ -56,10 +58,6 @@ main() { rapids-logger "Check GPU usage" nvidia-smi - EXITCODE=0 - trap "EXITCODE=1" ERR - set +e - rapids-logger "pytest ${lib}" NUM_PROCESSES=8 @@ -72,12 +70,20 @@ main() { fi done + EXITCODE=0 + trap "EXITCODE=1" ERR + set +e + TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib} + set -e rapids-logger "Test script exiting with value: ${EXITCODE}" + if [[ ${EXITCODE} != 0 ]]; then + ANY_FAILURES=1 + fi done - exit ${EXITCODE} + exit ${ANY_FAILURES} } main "$@" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3d77307ccde..78f529a44d3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -277,7 +277,7 @@ rapids_cpm_init() # Not using rapids-cmake since we never want to find, always download. CPMAddPackage( NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG - 14bb233d2420f7187a690f0bb528ec0420c70d48 + c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 ) rapids_make_logger(cudf EXPORT_SET cudf-exports) @@ -1105,7 +1105,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" ) target_include_directories(${_tgt} PRIVATE "$") - target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm rmm::rmm_logger rmm::rmm_logger_impl) if(CUDF_BUILD_STACKTRACE_DEBUG) target_link_libraries(${_tgt} PRIVATE cudf_backtrace) endif() diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8e5ea900efa..749e1b628ee 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) endfunction() # ################################################################################################## -# * column benchmarks ----------------------------------------------------------------------------- -ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp) +# * copying benchmarks +# ----------------------------------------------------------------------------- +ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp) # ################################################################################################## # * gather benchmark ------------------------------------------------------------------------------ @@ -351,11 +352,18 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/subword.cpp) - ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH + text/edit_distance.cpp + text/hash_ngrams.cpp + text/jaccard.cpp + text/minhash.cpp + text/ngrams.cpp + text/normalize.cpp + text/replace.cpp + text/subword.cpp + text/tokenize.cpp + text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp deleted file mode 100644 index 51106c72137..00000000000 --- a/cpp/benchmarks/column/concatenate.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -class Concatenate : public cudf::benchmark {}; - -template -static void BM_concatenate(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - - auto input = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - auto input_columns = input->view(); - std::vector column_views(input_columns.begin(), input_columns.end()); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); -} - -#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_BENCHMARK_DEFINE(int64_t, false) -CONCAT_BENCHMARK_DEFINE(int64_t, true) - -template -static void BM_concatenate_tables(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - cudf::size_type const num_tables = state.range(2); - - std::vector> tables(num_tables); - std::generate_n(tables.begin(), num_tables, [&]() { - return create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - }); - - // Generate table views - std::vector table_views(num_tables); - std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable { - return table->view(); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(table_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); -} - -#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_tables(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) - -class ConcatenateStrings : public cudf::benchmark {}; - -template -static void BM_concatenate_strings(benchmark::State& state) -{ - using column_wrapper = cudf::test::strings_column_wrapper; - - auto const num_rows = state.range(0); - auto const num_chars = state.range(1); - auto const num_cols = state.range(2); - - std::string str(num_chars, 'a'); - - // Create owning columns - std::vector columns; - columns.reserve(num_cols); - std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() { - auto iter = thrust::make_constant_iterator(c_str); - if (Nullable) { - auto count_it = thrust::make_counting_iterator(0); - auto valid_iter = - thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; }); - return column_wrapper(iter, iter + num_rows, valid_iter); - } else { - return column_wrapper(iter, iter + num_rows); - } - }); - - // Generate column views - std::vector column_views; - column_views.reserve(columns.size()); - std::transform( - columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) { - return static_cast(col); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * - (sizeof(int32_t) + num_chars)); // offset + chars -} - -#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_strings(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_STRINGS_BENCHMARK_DEFINE(false) -CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp new file mode 100644 index 00000000000..586b479d0ad --- /dev/null +++ b/cpp/benchmarks/copying/concatenate.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +#include + +#include + +static void bench_concatenate(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const nulls = static_cast(state.get_float64("nulls")); + + auto input = create_sequence_table( + cycle_dtypes({cudf::type_to_id()}, num_cols), row_count{num_rows}, nulls); + auto input_columns = input->view(); + auto column_views = std::vector(input_columns.begin(), input_columns.end()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_reads(num_rows * num_cols); + state.add_global_memory_writes(num_rows * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate) + .set_name("concatenate") + .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144}) + .add_int64_axis("num_cols", {2, 8, 64, 512, 1024}) + .add_float64_axis("nulls", {0.0, 0.3}); + +static void bench_concatenate_strings(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const nulls = static_cast(state.get_float64("nulls")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .null_probability(nulls); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = column->view(); + + auto column_views = std::vector(num_cols, input); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const sv = cudf::strings_column_view(input); + state.add_global_memory_reads(sv.chars_size(stream) * num_cols); + state.add_global_memory_writes(sv.chars_size(stream) * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate_strings) + .set_name("concatenate_strings") + .add_int64_axis("num_rows", {256, 512, 4096, 16384}) + .add_int64_axis("num_cols", {2, 8, 64, 256}) + .add_int64_axis("row_width", {32, 128}) + .add_float64_axis("nulls", {0.0, 0.3}); diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index cd4d3ca964b..9750475a079 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -24,18 +24,14 @@ void bench_case(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const max_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const encoding = state.get_string("encoding"); - if (static_cast(n_rows) * static_cast(max_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); auto col_view = column->view(); @@ -74,6 +70,7 @@ void bench_case(nvbench::state& state) NVBENCH_BENCH(bench_case) .set_name("case") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index eec9a5f54d7..abc5254392e 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -25,16 +25,12 @@ static void bench_char_types(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const api_type = state.get_string("api"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_char_types(nvbench::state& state) NVBENCH_BENCH(bench_char_types) .set_name("char_types") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index a73017dda18..e3940cbc0c7 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -29,17 +29,12 @@ std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43" static void bench_contains(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto col = create_string_column(n_rows, row_width, hit_rate); + auto col = create_string_column(num_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; @@ -56,7 +51,7 @@ static void bench_contains(nvbench::state& state) NVBENCH_BENCH(bench_contains) .set_name("contains") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {50, 100}) // percentage .add_int64_axis("pattern", {0, 1, 2}); diff --git a/cpp/benchmarks/string/copy_if_else.cpp b/cpp/benchmarks/string/copy_if_else.cpp index e06cca497c2..5a5743dfddf 100644 --- a/cpp/benchmarks/string/copy_if_else.cpp +++ b/cpp/benchmarks/string/copy_if_else.cpp @@ -25,15 +25,11 @@ static void bench_copy(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const str_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const source_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile); auto const target_table = @@ -58,5 +54,6 @@ static void bench_copy(nvbench::state& state) NVBENCH_BENCH(bench_copy) .set_name("copy_if_else") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/copy_range.cpp b/cpp/benchmarks/string/copy_range.cpp index af217a49195..7e7353a0e78 100644 --- a/cpp/benchmarks/string/copy_range.cpp +++ b/cpp/benchmarks/string/copy_range.cpp @@ -25,16 +25,12 @@ static void bench_copy_range(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const source_tables = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile); @@ -56,5 +52,6 @@ static void bench_copy_range(nvbench::state& state) NVBENCH_BENCH(bench_copy_range) .set_name("copy_range") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index f964bc5d224..cf90e316f71 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -30,16 +30,12 @@ static std::string patterns[] = {"\\d+", "a"}; static void bench_count(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_count(nvbench::state& state) NVBENCH_BENCH(bench_count) .set_name("count") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index af4fedb5799..d6866598ff4 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -32,11 +32,6 @@ static void bench_extract(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto groups = static_cast(state.get_int64("groups")); std::default_random_engine generator; @@ -79,6 +74,6 @@ static void bench_extract(nvbench::state& state) NVBENCH_BENCH(bench_extract) .set_name("extract") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("groups", {1, 2, 4}); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index 6dcf731ad3c..27652193b7b 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -25,15 +25,11 @@ static void bench_join(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -54,5 +50,6 @@ static void bench_join(nvbench::state& state) NVBENCH_BENCH(bench_join) .set_name("strings_join") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index a19060ead3b..8156e19412b 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -25,15 +25,11 @@ static void bench_lengths(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_lengths(nvbench::state& state) NVBENCH_BENCH(bench_lengths) .set_name("lengths") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 105ae65cbe8..f6410aaef30 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -30,11 +30,6 @@ static void bench_like(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); @@ -54,6 +49,6 @@ static void bench_like(nvbench::state& state) NVBENCH_BENCH(bench_like) .set_name("strings_like") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {10, 25, 70, 100}); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index 4dcf1314f83..69426a2d484 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -26,18 +26,14 @@ static void bench_replace(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const rtype = state.get_string("type"); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto program = cudf::strings::regex_program::create("(\\d+)"); @@ -62,6 +58,7 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"replace", "backref"}); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index a2676609a40..e2e914cb350 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -25,15 +25,11 @@ static void bench_reverse(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_reverse(nvbench::state& state) NVBENCH_BENCH(bench_reverse) .set_name("reverse") - .add_int64_axis("row_width", {8, 16, 32, 64, 128}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 1898f0340b6..c828a8ed0b0 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -36,11 +36,6 @@ static void bench_slice(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); @@ -76,6 +71,6 @@ static void bench_slice(nvbench::state& state) NVBENCH_BENCH(bench_slice) .set_name("slice") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"position", "multi"}); diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 9ef58daf0fc..9c7c27c4f07 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -28,16 +28,12 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); cudf::string_scalar target("+"); @@ -66,6 +62,7 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"split", "split_ws", "record", "record_ws"}); diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp index 1fdb6e67109..34a7aa96e84 100644 --- a/cpp/benchmarks/string/split_re.cpp +++ b/cpp/benchmarks/string/split_re.cpp @@ -28,17 +28,13 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto prog = cudf::strings::regex_program::create("\\d+"); data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -56,5 +52,6 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp deleted file mode 100644 index a34026281e8..00000000000 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -/** - * @brief Generate row count and row length argument ranges for a string benchmark. - * - * Generates a series of row count and row length arguments for string benchmarks. - * Combinations of row count and row length that would exceed the maximum string character - * column data length are not generated. - * - * @param b Benchmark to update with row count and row length arguments. - * @param min_rows Minimum row count argument to generate. - * @param max_rows Maximum row count argument to generate. - * @param rows_mult Row count multiplier to generate intermediate row count arguments. - * @param min_rowlen Minimum row length argument to generate. - * @param max_rowlen Maximum row length argument to generate. - * @param rowlen_mult Row length multiplier to generate intermediate row length arguments. - */ -inline void generate_string_bench_args(benchmark::internal::Benchmark* b, - int min_rows, - int max_rows, - int rows_mult, - int min_rowlen, - int max_rowlen, - int rowlen_mult) -{ - for (int row_count = min_rows; row_count <= max_rows; row_count *= rows_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } - } -} diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 6ffa90edb8f..0ad1ae30f8c 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -27,15 +27,11 @@ static void bench_edit_distance(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input1(strings_table->view().column(0)); @@ -55,5 +51,6 @@ static void bench_edit_distance(nvbench::state& state) NVBENCH_BENCH(bench_edit_distance) .set_name("edit_distance") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144}); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp index 4e5daf83a3c..7577cf00c0f 100644 --- a/cpp/benchmarks/text/hash_ngrams.cpp +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -27,16 +27,12 @@ static void bench_hash_ngrams(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const ngrams = static_cast(state.get_int64("ngrams")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); @@ -55,6 +51,7 @@ static void bench_hash_ngrams(nvbench::state& state) NVBENCH_BENCH(bench_hash_ngrams) .set_name("hash_ngrams") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {16384, 32768, 262144}) .add_int64_axis("ngrams", {5, 10}); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d5b74da6773..5506501138b 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -28,17 +28,13 @@ static void bench_jaccard(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const substring_width = static_cast(state.get_int64("substring_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const input_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); @@ -59,6 +55,7 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 1024, 2048}) .add_int64_axis("num_rows", {32768, 131072, 262144}) - .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 71bccd80d39..594dc0de28a 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -28,16 +28,12 @@ static void bench_normalize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const normalize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -60,6 +56,7 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 767ebab3eee..24ca4e5dfd7 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -31,11 +31,6 @@ static void bench_replace(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - std::vector words{" ", "one ", "two ", "three ", "four ", "five ", "six ", "sevén ", "eight ", "nine ", "ten ", "eleven ", "twelve ", "thirteen ", "fourteen ", @@ -71,5 +66,5 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index dd8df695d3e..0b4e3bdefa5 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,6 @@ * limitations under the License. */ -#include -#include - #include #include @@ -24,6 +21,8 @@ #include +#include + #include #include #include @@ -54,40 +53,33 @@ static std::string create_hash_vocab_file() return hash_file; } -static void BM_subword_tokenizer(benchmark::State& state) +static void bench_subword_tokenizer(nvbench::state& state) { - auto const nrows = static_cast(state.range(0)); - std::vector h_strings(nrows, "This is a test "); + auto const num_rows = static_cast(state.get_int64("num_rows")); + + std::vector h_strings(num_rows, "This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); static std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; - uint32_t max_sequence_length = 64; - uint32_t stride = 48; - uint32_t do_truncate = 0; - uint32_t do_lower = 1; - // - auto vocab = nvtext::load_vocabulary_file(hash_file); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - *vocab, - max_sequence_length, - stride, - do_lower, - do_truncate); - } -} + uint32_t max_sequence = 64; + uint32_t stride = 48; + uint32_t do_truncate = 0; + uint32_t do_lower = 1; -class Subword : public cudf::benchmark {}; + auto input = cudf::strings_column_view{strings}; -#define SUBWORD_BM_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \ - BENCHMARK_REGISTER_F(Subword, name) \ - ->RangeMultiplier(2) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows * max_sequence); -SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer); + auto vocab = nvtext::load_vocabulary_file(hash_file); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = + nvtext::subword_tokenize(input, *vocab, max_sequence, stride, do_lower, do_truncate); + }); +} -// BENCHMARK_MAIN(); +NVBENCH_BENCH(bench_subword_tokenizer) + .set_name("subword_tokenize") + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index e83310e0343..b9590c5539f 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -31,17 +31,13 @@ static void bench_tokenize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const tokenize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -82,6 +78,7 @@ static void bench_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_tokenize) .set_name("tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"}); diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 523d277df18..0502f375d99 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -33,16 +33,12 @@ static void bench_vocab_tokenize(nvbench::state& state) { auto const stream = cudf::get_default_stream(); auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto const column = [num_rows, row_width] { + auto const column = [num_rows, min_width, max_width] { data_profile const profile = data_profile_builder().no_validity().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); return cudf::strings::filter_characters_of_type( cudf::strings_column_view(col->view()), @@ -85,5 +81,6 @@ static void bench_vocab_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_vocab_tokenize) .set_name("vocab_tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp deleted file mode 100644 index adc3dddc59c..00000000000 --- a/cpp/benchmarks/text/word_minhash.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -static void bench_word_minhash(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); - auto const base64 = state.get_int64("hash_type") == 64; - - data_profile const strings_profile = - data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); - auto strings_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); - - auto const num_offsets = (num_rows / row_width) + 1; - auto offsets = cudf::sequence(num_offsets, - cudf::numeric_scalar(0), - cudf::numeric_scalar(row_width)); - - auto source = cudf::make_lists_column(num_offsets - 1, - std::move(offsets), - std::move(strings_table->release().front()), - 0, - rmm::device_buffer{}); - - data_profile const seeds_profile = data_profile_builder().no_validity().distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, 256); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - - cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); - auto chars_size = input.chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); - state.add_global_memory_writes(num_rows); // output are hashes - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) - : nvtext::word_minhash(source->view(), seeds.view()); - }); -} - -NVBENCH_BENCH(bench_word_minhash) - .set_name("word_minhash") - .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) - .add_int64_axis("row_width", {10, 100, 1000}) - .add_int64_axis("seed_count", {2, 25}) - .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 5dc75b1a3fb..a7efb4e6e93 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -44,10 +44,11 @@ __launch_bounds__(block_size) CUDF_KERNEL mutable_column_device_view out, size_type* __restrict__ const valid_count) { - auto tidx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - int const warp_id = tidx / cudf::detail::warp_size; - size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const warp_id = tidx / cudf::detail::warp_size; + auto const warps_per_grid = stride / cudf::detail::warp_size; // begin/end indices for the column data size_type const begin = 0; @@ -60,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // lane id within the current warp constexpr size_type leader_lane{0}; - int const lane_id = threadIdx.x % cudf::detail::warp_size; + auto const lane_id = threadIdx.x % cudf::detail::warp_size; size_type warp_valid_count{0}; diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 5ea0d06039f..1bfb40e5916 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -48,11 +49,9 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); - T result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDefault, stream.value())); - stream.synchronize(); - return result; + return cudf::detail::make_host_vector_sync( + device_span{col_view.data() + element_index, 1}, stream) + .front(); } } // namespace detail diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 16d532ea2b8..4f6238b5fe7 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -16,6 +16,8 @@ #pragma once #include +#include +#include #include #include #include @@ -251,7 +253,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st // A buffer of CPU memory is allocated to hold the ColumnDeviceView // objects. Once filled, the CPU memory is then copied to device memory // and the pointer is set in the d_columns member. - std::vector h_buffer(padded_views_size_bytes); + auto h_buffer = cudf::detail::make_host_vector(padded_views_size_bytes, stream); // Each ColumnDeviceView instance may have child objects which may // require setting some internal device pointers before being copied // from CPU to device. @@ -266,8 +268,10 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st auto d_columns = detail::child_columns_to_device_array( source_view.begin(), source_view.end(), h_ptr, d_ptr); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + auto const h_span = host_span{h_buffer}.subspan( + static_cast(h_ptr) - h_buffer.data(), views_size_bytes); + auto const d_span = device_span{static_cast(d_ptr), views_size_bytes}; + cudf::detail::cuda_memcpy(d_span, h_span, stream); return std::make_tuple(std::move(descendant_storage), d_columns); } diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d8419760120..6fc49afd7ac 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -308,7 +308,11 @@ std::unique_ptr for_each_concatenate(host_span views, auto count = 0; for (auto& v : views) { - thrust::copy(rmm::exec_policy(stream), v.begin(), v.end(), m_view.begin() + count); + cudaMemcpyAsync(m_view.begin() + count, + v.begin(), + v.size() * sizeof(T), + cudaMemcpyDeviceToDevice, + stream.value()); count += v.size(); } diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 2f6942fe139..cc5f256ea80 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -464,17 +464,6 @@ std::unique_ptr make_all_nulls_column(schema_element const& schema, */ column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name); -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the column - * @param options json reader options which holds schema - * @return data type of the column if present - */ -std::optional get_path_data_type( - host_span const> path, - cudf::io::json_reader_options const& options); - /** * @brief Helper class to get path of a column by column id from reduced column tree * diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 2da320b2af3..4b4827ca8d9 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -68,78 +68,6 @@ void json_reader_options::set_dtypes(schema_element types) } // namespace cudf::io namespace cudf::io::json::detail { -namespace { - -// example schema and its path. -// "a": int {"a", int} -// "a": [ int ] {"a", list}, {"element", int} -// "a": { "b": int} {"a", struct}, {"b", int} -// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int} -// "a": [ null] {"a", list}, {"element", str} -// back() is root. -// front() is leaf. -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the json column - * @param root root of input schema element - * @return data type of the column if present, otherwise std::nullopt - */ -std::optional get_path_data_type( - host_span const> path, schema_element const& root) -{ - if (path.empty() || path.size() == 1) { - return root.type; - } else { - if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) { - auto const child_name = path.first(path.size() - 1).back().first; - auto const child_schema_it = root.child_types.find(child_name); - return (child_schema_it != std::end(root.child_types)) - ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) { - auto const child_schema_it = root.child_types.find(list_child_name); - return (child_schema_it != std::end(root.child_types)) - ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } - return std::optional{}; - } -} - -std::optional child_schema_element(std::string const& col_name, - cudf::io::json_reader_options const& options) -{ - return std::visit( - cudf::detail::visitor_overload{ - [col_name](std::vector const& user_dtypes) -> std::optional { - auto column_index = atol(col_name.data()); - return (static_cast(column_index) < user_dtypes.size()) - ? std::optional{{user_dtypes[column_index]}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; - }, - [col_name](schema_element const& user_dtypes) -> std::optional { - return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) - ? user_dtypes.child_types.find(col_name)->second - : std::optional{}; - }}, - options.get_dtypes()); -} - -} // namespace - /// Created an empty column of the specified schema struct empty_column_functor { rmm::cuda_stream_view stream; @@ -311,48 +239,4 @@ column_name_info make_column_name_info(schema_element const& schema, std::string } return info; } - -std::optional get_path_data_type( - host_span const> path, - cudf::io::json_reader_options const& options) -{ - if (path.empty()) return {}; - std::optional col_schema = child_schema_element(path.back().first, options); - // check if it has value, then do recursive call and return. - if (col_schema.has_value()) { - return get_path_data_type(path, col_schema.value()); - } else { - return {}; - } -} - -// idea: write a memoizer using template and lambda?, then call recursively. -std::vector path_from_tree::get_path(NodeIndexT this_col_id) -{ - std::vector path; - // stops at root. - while (this_col_id != parent_node_sentinel) { - auto type = column_categories[this_col_id]; - std::string name = ""; - // code same as name_and_parent_index lambda. - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } - // "name": type/schema - path.emplace_back(name, type); - this_col_id = parent_col_id; - if (this_col_id == row_array_parent_col_id) return path; - } - return {}; -} - } // namespace cudf::io::json::detail diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 65cd6e25002..6b9c19368dc 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -1386,29 +1387,34 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, // we know the size of each array. The number of stripes per column in a chunk array can // be calculated by dividing the number of chunks by the number of columns. // That many chunks need to be copied at a time to the proper destination. - size_t num_entries_seen = 0; + size_t num_entries_seen = 0; + auto const num_buffers_to_copy = per_chunk_stats.stripe_stat_chunks.size() * num_columns * 2; + auto h_srcs = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_dsts = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_lens = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + for (size_t i = 0; i < per_chunk_stats.stripe_stat_chunks.size(); ++i) { auto const stripes_per_col = per_chunk_stats.stripe_stat_chunks[i].size() / num_columns; - auto const chunk_bytes = stripes_per_col * sizeof(statistics_chunk); - auto const merge_bytes = stripes_per_col * sizeof(statistics_merge_group); for (size_t col = 0; col < num_columns; ++col) { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col, - chunk_bytes, - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY( - cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col, - merge_bytes, - cudaMemcpyDefault, - stream.value())); + h_srcs.push_back(per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col); + h_dsts.push_back(stat_chunks.data() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_chunk)); + + h_srcs.push_back(per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col); + h_dsts.push_back(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_merge_group)); } num_entries_seen += stripes_per_col; } + auto const& mr = cudf::get_current_device_resource_ref(); + auto const d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr); + auto const d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr); + auto const d_lens = cudf::detail::make_device_uvector_async(h_lens, stream, mr); + cudf::detail::batched_memcpy_async( + d_srcs.begin(), d_dsts.begin(), d_lens.begin(), d_srcs.size(), stream); + auto file_stats_merge = cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) { diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index ebab3beb08f..d6b85db3f0f 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -138,7 +138,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, auto const stride = cudf::detail::grid_1d::grid_stride(); // Initialize local histogram - size_type partition_number = threadIdx.x; + thread_index_type partition_number = threadIdx.x; while (partition_number < num_partitions) { shared_partition_sizes[partition_number] = 0; partition_number += blockDim.x; @@ -207,7 +207,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit extern __shared__ size_type shared_partition_offsets[]; // Initialize array of this blocks offsets from global array - size_type partition_number = threadIdx.x; + thread_index_type partition_number = threadIdx.x; while (partition_number < num_partitions) { shared_partition_offsets[partition_number] = block_partition_offsets[partition_number * gridDim.x + blockIdx.x]; @@ -303,7 +303,8 @@ CUDF_KERNEL void copy_block_partitions(InputIter input_iter, // Fetch the offset in the output buffer of each partition in this thread // block - for (size_type ipartition = threadIdx.x; ipartition < num_partitions; ipartition += blockDim.x) { + for (thread_index_type ipartition = threadIdx.x; ipartition < num_partitions; + ipartition += blockDim.x) { partition_offset_global[ipartition] = scanned_block_partition_sizes[ipartition * gridDim.x + blockIdx.x]; } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index d27420658d6..2128bacff80 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -385,7 +385,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, size_type const* group_cluster_offsets, bool has_nulls) { - int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const group_index = tid; if (group_index >= num_groups) { return; } diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu index 4fd0369c26b..9d96c11c3f2 100644 --- a/cpp/src/transform/jit/kernel.cu +++ b/cpp/src/transform/jit/kernel.cu @@ -38,8 +38,9 @@ CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data { // cannot use global_thread_id utility due to a JIT build issue by including // the `cudf/detail/utilities/cuda.cuh` header - thread_index_type const start = threadIdx.x + blockIdx.x * blockDim.x; - thread_index_type const stride = blockDim.x * gridDim.x; + auto const block_size = static_cast(blockDim.x); + thread_index_type const start = threadIdx.x + blockIdx.x * block_size; + thread_index_type const stride = block_size * gridDim.x; for (auto i = start; i < static_cast(size); i += stride) { GENERIC_UNARY_OP(&out_data[i], in_data[i]); diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 66bbe532e46..39c11295fbd 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -413,7 +413,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span col size_type max_branch_depth) { extern __shared__ row_span thread_branch_stacks[]; - int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = static_cast(cudf::detail::grid_1d::global_thread_id()); auto const num_segments = static_cast(output.size()); if (tid >= num_segments) { return; } diff --git a/cpp/tests/bitmask/set_nullmask_tests.cu b/cpp/tests/bitmask/set_nullmask_tests.cu index e95c9fb41c6..9f8d22ea94d 100644 --- a/cpp/tests/bitmask/set_nullmask_tests.cu +++ b/cpp/tests/bitmask/set_nullmask_tests.cu @@ -31,6 +31,7 @@ #include #include +namespace { struct valid_bit_functor { cudf::bitmask_type const* _null_mask; __device__ bool operator()(cudf::size_type element_index) const noexcept @@ -38,13 +39,7 @@ struct valid_bit_functor { return cudf::bit_is_set(_null_mask, element_index); } }; - -std::ostream& operator<<(std::ostream& stream, thrust::host_vector const& bits) -{ - for (auto _bit : bits) - stream << int(_bit); - return stream; -} +} // namespace struct SetBitmaskTest : public cudf::test::BaseFixture { void expect_bitmask_equal(cudf::bitmask_type const* bitmask, // Device Ptr diff --git a/cpp/tests/bitmask/valid_if_tests.cu b/cpp/tests/bitmask/valid_if_tests.cu index 96f122f21a8..8ffcc552ecb 100644 --- a/cpp/tests/bitmask/valid_if_tests.cu +++ b/cpp/tests/bitmask/valid_if_tests.cu @@ -28,6 +28,7 @@ struct ValidIfTest : public cudf::test::BaseFixture {}; +namespace { struct odds_valid { __host__ __device__ bool operator()(cudf::size_type i) { return i % 2; } }; @@ -37,6 +38,7 @@ struct all_valid { struct all_null { __host__ __device__ bool operator()(cudf::size_type i) { return false; } }; +} // namespace TEST_F(ValidIfTest, EmptyRange) { diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp index 5570a7d498c..1f29ea9e5fc 100644 --- a/cpp/tests/column/bit_cast_test.cpp +++ b/cpp/tests/column/bit_cast_test.cpp @@ -25,6 +25,7 @@ #include +namespace { template struct rep_type_impl { using type = void; @@ -47,12 +48,14 @@ struct rep_type_impl()>> { template using rep_type_t = typename rep_type_impl::type; +} // namespace template struct ColumnViewAllTypesTests : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(ColumnViewAllTypesTests, cudf::test::FixedWidthTypes); +namespace { template void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator end) { @@ -102,6 +105,7 @@ void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator } } } +} // namespace TYPED_TEST(ColumnViewAllTypesTests, BitCast) { diff --git a/cpp/tests/column/compound_test.cu b/cpp/tests/column/compound_test.cu index d7e93fb22a3..fff3282fdd5 100644 --- a/cpp/tests/column/compound_test.cu +++ b/cpp/tests/column/compound_test.cu @@ -34,6 +34,7 @@ struct CompoundColumnTest : public cudf::test::BaseFixture {}; +namespace { template struct checker_for_level1 { ColumnDeviceView d_column; @@ -62,6 +63,7 @@ struct checker_for_level2 { return bcheck; } }; +} // namespace TEST_F(CompoundColumnTest, ChildrenLevel1) { diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index b81f8196d89..2fb24f6b31e 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -31,6 +31,7 @@ #include +namespace { template CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) { @@ -109,6 +110,7 @@ std::enable_if_t(), T> accumulate(cudf::host_span xs.begin(), xs.end(), ys.begin(), [](T const& ts) { return ts.time_since_epoch().count(); }); return T{typename T::duration{std::accumulate(ys.begin(), ys.end(), 0)}}; } +} // namespace template struct AtomicsTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index b96c6909e55..f8f8d525043 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -577,10 +577,12 @@ TEST_F(FixedPointTest, Decimal32FloatVector) float_vector_test(0.15, 20, -2, std::multiplies<>()); } +namespace { struct cast_to_int32_fn { using decimal32 = fixed_point; int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast(fp); } }; +} // namespace TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper) { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu index f34760341d8..ddc48c97012 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cu +++ b/cpp/tests/fixed_point/fixed_point_tests.cu @@ -72,10 +72,12 @@ TYPED_TEST(FixedPointTestAllReps, DecimalXXThrust) EXPECT_EQ(vec2, vec3); } +namespace { struct cast_to_int32_fn { using decimal32 = fixed_point; int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast(fp); } }; +} // namespace TEST_F(FixedPointTest, DecimalXXThrustOnDevice) { diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 4ae5d06b214..883a5093bd1 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -30,6 +30,7 @@ #include #include +namespace { /** * @brief Functor to generate a tdigest by key. * @@ -116,6 +117,7 @@ struct tdigest_groupby_simple_merge_op { return std::move(result.second[0].results[0]); } }; +} // namespace template struct TDigestAllTypes : public cudf::test::BaseFixture {}; @@ -508,6 +510,7 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } +namespace { std::unique_ptr do_agg( cudf::column_view key, cudf::column_view val, @@ -537,6 +540,7 @@ std::unique_ptr do_agg( return std::make_unique(std::move(result_columns)); } +} // namespace TEST_F(TDigestMergeTest, AllValuesAreNull) { diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index ef4b9dd9b8a..b7106e823dd 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -26,6 +26,7 @@ #include +namespace { struct dlpack_deleter { void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); } }; @@ -60,6 +61,7 @@ void validate_dtype(DLDataType const& dtype) EXPECT_EQ(1, dtype.lanes); EXPECT_EQ(sizeof(T) * 8, dtype.bits); } +} // namespace class DLPackUntypedTests : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 887d4fa783f..5201a46ba7d 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -34,6 +34,8 @@ namespace cuio_json = cudf::io::json; +namespace { + // Host copy of tree_meta_t struct tree_meta_t2 { std::vector node_categories; @@ -43,8 +45,6 @@ struct tree_meta_t2 { std::vector node_range_end; }; -namespace { - tree_meta_t2 to_cpu_tree(cuio_json::tree_meta_t const& d_value, rmm::cuda_stream_view stream) { return {cudf::detail::make_std_vector_async(d_value.node_categories, stream), diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu index f988ae24b38..a67830a7864 100644 --- a/cpp/tests/io/json/json_tree_csr.cu +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -36,6 +36,8 @@ namespace cuio_json = cudf::io::json; +namespace { + struct h_tree_meta_t { std::vector node_categories; std::vector parent_node_ids; @@ -222,6 +224,7 @@ void run_test(std::string const& input, bool enable_lines = true) // assert equality between csr and meta formats ASSERT_TRUE(iseq); } +} // namespace struct JsonColumnTreeTests : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index 153a8a0c5aa..369376b6c95 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -1074,6 +1074,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount) } while (reader.has_next()); } +namespace { constexpr size_t input_limit_expected_file_count = 4; std::vector input_limit_get_test_names(std::string const& base_filename) @@ -1133,6 +1134,7 @@ void input_limit_test_read(std::vector const& test_filenames, CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); } } +} // namespace struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFixture {}; @@ -1189,6 +1191,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns) struct ParquetChunkedReaderInputLimitTest : public cudf::test::BaseFixture {}; +namespace { struct offset_gen { int const group_size; __device__ int operator()(int i) { return i * group_size; } @@ -1198,6 +1201,8 @@ template struct value_gen { __device__ T operator()(int i) { return i % 1024; } }; +} // namespace + TEST_F(ParquetChunkedReaderInputLimitTest, List) { auto base_path = temp_env->get_temp_filepath("list"); @@ -1263,6 +1268,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List) input_limit_test_read(test_filenames, tbl, 32 * 1024 * 1024, 64 * 1024 * 1024, expected_c); } +namespace { void tiny_list_rowgroup_test(bool just_list_col) { auto iter = thrust::make_counting_iterator(0); @@ -1320,6 +1326,7 @@ void tiny_list_rowgroup_test(bool just_list_col) CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *(result.first)); } +} // namespace TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsSingle) { @@ -1333,6 +1340,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsMixed) tiny_list_rowgroup_test(false); } +namespace { struct char_values { __device__ int8_t operator()(int i) { @@ -1341,6 +1349,8 @@ struct char_values { return index == 0 ? 'a' : (index == 1 ? 'b' : 'c'); } }; +} // namespace + TEST_F(ParquetChunkedReaderInputLimitTest, Mixed) { auto base_path = temp_env->get_temp_filepath("mixed_types"); diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu index 257c0979017..8377060b6ec 100644 --- a/cpp/tests/iterator/optional_iterator_test_numeric.cu +++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu @@ -26,16 +26,6 @@ using TestingTypes = cudf::test::NumericTypes; -namespace cudf { -// To print meanvar for debug. -// Needs to be in the cudf namespace for ADL -template -std::ostream& operator<<(std::ostream& os, cudf::meanvar const& rhs) -{ - return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] "; -}; -} // namespace cudf - template struct NumericOptionalIteratorTest : public IteratorTest {}; @@ -46,6 +36,7 @@ TYPED_TEST(NumericOptionalIteratorTest, nonull_optional_iterator) } TYPED_TEST(NumericOptionalIteratorTest, null_optional_iterator) { null_optional_iterator(*this); } +namespace { // Transformers and Operators for optional_iterator test template struct transformer_optional_meanvar { @@ -65,6 +56,7 @@ template struct optional_to_meanvar { CUDF_HOST_DEVICE inline T operator()(cuda::std::optional const& v) { return v.value_or(T{0}); } }; +} // namespace // TODO: enable this test also at __CUDACC_DEBUG__ // This test causes fatal compilation error only at device debug mode. diff --git a/cpp/tests/iterator/pair_iterator_test_numeric.cu b/cpp/tests/iterator/pair_iterator_test_numeric.cu index 3447aa0dde6..5f707232953 100644 --- a/cpp/tests/iterator/pair_iterator_test_numeric.cu +++ b/cpp/tests/iterator/pair_iterator_test_numeric.cu @@ -24,16 +24,6 @@ using TestingTypes = cudf::test::NumericTypes; -namespace cudf { -// To print meanvar for debug. -// Needs to be in the cudf namespace for ADL -template -std::ostream& operator<<(std::ostream& os, cudf::meanvar const& rhs) -{ - return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] "; -}; -} // namespace cudf - template struct NumericPairIteratorTest : public IteratorTest {}; @@ -53,6 +43,7 @@ struct transformer_pair_meanvar { }; }; +namespace { struct sum_if_not_null { template CUDF_HOST_DEVICE inline thrust::pair operator()(thrust::pair const& lhs, @@ -66,6 +57,7 @@ struct sum_if_not_null { return {rhs}; } }; +} // namespace // TODO: enable this test also at __CUDACC_DEBUG__ // This test causes fatal compilation error only at device debug mode. diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 37414eb3fba..c146fd2ea4e 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -33,6 +33,7 @@ #include +namespace { std::unique_ptr arrow_percentile_approx(cudf::column_view const& _values, int delta, std::vector const& percentages) @@ -315,6 +316,7 @@ cudf::data_type get_appropriate_type() if constexpr (cudf::is_fixed_point()) { return cudf::data_type{cudf::type_to_id(), -7}; } return cudf::data_type{cudf::type_to_id()}; } +} // namespace using PercentileApproxTypes = cudf::test::Concat; diff --git a/cpp/tests/reductions/tdigest_tests.cu b/cpp/tests/reductions/tdigest_tests.cu index c8fec51e1c9..184725e17e0 100644 --- a/cpp/tests/reductions/tdigest_tests.cu +++ b/cpp/tests/reductions/tdigest_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ template struct ReductionTDigestAllTypes : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(ReductionTDigestAllTypes, cudf::test::NumericTypes); +namespace { struct reduce_op { std::unique_ptr operator()(cudf::column_view const& values, int delta) const { @@ -60,6 +61,7 @@ struct reduce_merge_op { return cudf::make_structs_column(tbl.num_rows(), std::move(cols), 0, rmm::device_buffer()); } }; +} // namespace TYPED_TEST(ReductionTDigestAllTypes, Simple) { diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp index 7133baf6df1..79ea6b7d6d4 100644 --- a/cpp/tests/streams/interop_test.cpp +++ b/cpp/tests/streams/interop_test.cpp @@ -23,9 +23,11 @@ #include +namespace { struct dlpack_deleter { void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); } }; +} // namespace struct DLPackTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 01a042130d6..7e203086fca 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -590,6 +590,7 @@ TEST_F(RowBitCount, EmptyChildColumnInListOfLists) cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); } +namespace { struct sum_functor { cudf::size_type const* s0; cudf::size_type const* s1; @@ -597,6 +598,7 @@ struct sum_functor { cudf::size_type operator() __device__(int i) { return s0[i] + s1[i] + s2[i]; } }; +} // namespace TEST_F(RowBitCount, Table) { diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index 4086c5a91bb..8e5129dfbd2 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -37,6 +37,7 @@ #include #include +namespace { template struct ChronoColumnTest : public cudf::test::BaseFixture { cudf::size_type size() { return cudf::size_type(100); } @@ -72,6 +73,7 @@ struct compare_chrono_elements_to_primitive_representation { return primitive == dur.count(); } }; +} // namespace TYPED_TEST_SUITE(ChronoColumnTest, cudf::test::ChronoTypes); @@ -103,6 +105,7 @@ TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation) *cudf::column_device_view::create(chrono_col)})); } +namespace { template struct compare_chrono_elements { cudf::binary_operator comp; @@ -129,6 +132,7 @@ struct compare_chrono_elements { } } }; +} // namespace TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) { diff --git a/java/src/main/java/ai/rapids/cudf/Scalar.java b/java/src/main/java/ai/rapids/cudf/Scalar.java index 286b5c208c9..f3155bc5860 100644 --- a/java/src/main/java/ai/rapids/cudf/Scalar.java +++ b/java/src/main/java/ai/rapids/cudf/Scalar.java @@ -521,13 +521,28 @@ private static ColumnVector buildNullColumnVector(HostColumnVector.DataType host private static native long makeStructScalar(long[] viewHandles, boolean isValid); private static native long repeatString(long scalarHandle, int repeatTimes); - Scalar(DType type, long scalarHandle) { + /** + * Constructor to create a scalar from a native handle and a type. + * + * @param type The type of the scalar + * @param scalarHandle The native handle (pointer address) to the scalar data + */ + public Scalar(DType type, long scalarHandle) { this.type = type; this.offHeap = new OffHeapState(scalarHandle); MemoryCleaner.register(this, offHeap); incRefCount(); } + /** + * Get the native handle (native pointer address) for the scalar. + * + * @return The native handle + */ + public long getScalarHandle() { + return offHeap.scalarHandle; + } + /** * Increment the reference count for this scalar. You need to call close on this * to decrement the reference count again. @@ -542,10 +557,6 @@ public synchronized Scalar incRefCount() { return this; } - long getScalarHandle() { - return offHeap.scalarHandle; - } - /** * Free the memory associated with a scalar. */ diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index efe96ff6c3e..427ffcc8c12 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,9 +12,8 @@ # the License. # ============================================================================= -set(cython_sources - column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx parquet.pyx reduce.pyx scalar.pyx - sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx +set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx stream_compaction.pyx + string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) @@ -30,6 +29,3 @@ target_include_directories(interop PUBLIC "$= -nrows and gm_max < nrows @@ -358,14 +354,13 @@ class PackedColumns(Serializable): header["index-names"] = self.index_names header["metadata"] = self._metadata.tobytes() for name, dtype in self.column_dtypes.items(): - dtype_header, dtype_frames = dtype.serialize() + dtype_header, dtype_frames = dtype.device_serialize() self.column_dtypes[name] = ( dtype_header, (len(frames), len(frames) + len(dtype_frames)), ) frames.extend(dtype_frames) header["column-dtypes"] = self.column_dtypes - header["type-serialized"] = pickle.dumps(type(self)) return header, frames @classmethod @@ -373,9 +368,9 @@ class PackedColumns(Serializable): column_dtypes = {} for name, dtype in header["column-dtypes"].items(): dtype_header, (start, stop) = dtype - column_dtypes[name] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize(dtype_header, frames[start:stop]) + column_dtypes[name] = Serializable.device_deserialize( + dtype_header, frames[start:stop] + ) return cls( plc.contiguous_split.pack( plc.contiguous_split.unpack_from_memoryviews( diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx deleted file mode 100644 index 641fc18c203..00000000000 --- a/python/cudf/cudf/_lib/csv.pyx +++ /dev/null @@ -1,414 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -cimport pylibcudf.libcudf.types as libcudf_types - -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import errno -import os -from collections import abc -from io import BytesIO, StringIO - -import numpy as np -import pandas as pd - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.utils cimport data_from_pylibcudf_io - -import pylibcudf as plc - -from cudf.api.types import is_hashable - -from pylibcudf.types cimport DataType - -CSV_HEX_TYPE_MAP = { - "hex": np.dtype("int64"), - "hex64": np.dtype("int64"), - "hex32": np.dtype("int32") -} - - -def validate_args( - object delimiter, - object sep, - bool delim_whitespace, - object decimal, - object thousands, - object nrows, - int skipfooter, - object byte_range, - int skiprows -): - if delim_whitespace: - if delimiter is not None: - raise ValueError("cannot set both delimiter and delim_whitespace") - if sep != ',': - raise ValueError("cannot set both sep and delim_whitespace") - - # Alias sep -> delimiter. - actual_delimiter = delimiter if delimiter else sep - - if decimal == actual_delimiter: - raise ValueError("decimal cannot be the same as delimiter") - - if thousands == actual_delimiter: - raise ValueError("thousands cannot be the same as delimiter") - - if nrows is not None and skipfooter != 0: - raise ValueError("cannot use both nrows and skipfooter parameters") - - if byte_range is not None: - if skipfooter != 0 or skiprows != 0 or nrows is not None: - raise ValueError("""cannot manually limit rows to be read when - using the byte range parameter""") - - -def read_csv( - object datasource, - object lineterminator="\n", - object quotechar='"', - int quoting=0, - bool doublequote=True, - object header="infer", - bool mangle_dupe_cols=True, - object usecols=None, - object sep=",", - object delimiter=None, - bool delim_whitespace=False, - bool skipinitialspace=False, - object names=None, - object dtype=None, - int skipfooter=0, - int skiprows=0, - bool dayfirst=False, - object compression="infer", - object thousands=None, - object decimal=".", - object true_values=None, - object false_values=None, - object nrows=None, - object byte_range=None, - bool skip_blank_lines=True, - object parse_dates=None, - object comment=None, - object na_values=None, - bool keep_default_na=True, - bool na_filter=True, - object prefix=None, - object index_col=None, -): - """ - Cython function to call into libcudf API, see `read_csv`. - - See Also - -------- - cudf.read_csv - """ - - if not isinstance(datasource, (BytesIO, StringIO, bytes)): - if not os.path.isfile(datasource): - raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), datasource - ) - - if isinstance(datasource, StringIO): - datasource = datasource.read().encode() - elif isinstance(datasource, str) and not os.path.isfile(datasource): - datasource = datasource.encode() - - validate_args(delimiter, sep, delim_whitespace, decimal, thousands, - nrows, skipfooter, byte_range, skiprows) - - # Alias sep -> delimiter. - if delimiter is None: - delimiter = sep - - delimiter = str(delimiter) - - if byte_range is None: - byte_range = (0, 0) - - if compression is None: - c_compression = plc.io.types.CompressionType.NONE - else: - compression_map = { - "infer": plc.io.types.CompressionType.AUTO, - "gzip": plc.io.types.CompressionType.GZIP, - "bz2": plc.io.types.CompressionType.BZIP2, - "zip": plc.io.types.CompressionType.ZIP, - } - c_compression = compression_map[compression] - - # We need this later when setting index cols - orig_header = header - - if names is not None: - # explicitly mentioned name, so don't check header - if header is None or header == 'infer': - header = -1 - else: - header = header - names = list(names) - else: - if header is None: - header = -1 - elif header == 'infer': - header = 0 - - hex_cols = [] - - new_dtypes = [] - if dtype is not None: - if isinstance(dtype, abc.Mapping): - new_dtypes = dict() - for k, v in dtype.items(): - col_type = v - if is_hashable(v) and v in CSV_HEX_TYPE_MAP: - col_type = CSV_HEX_TYPE_MAP[v] - hex_cols.append(str(k)) - - new_dtypes[k] = _get_plc_data_type_from_dtype( - cudf.dtype(col_type) - ) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: - dtype = CSV_HEX_TYPE_MAP[dtype] - hex_cols.append(0) - - new_dtypes.append( - _get_plc_data_type_from_dtype(dtype) - ) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: - col_dtype = CSV_HEX_TYPE_MAP[col_dtype] - hex_cols.append(index) - - new_dtypes.append( - _get_plc_data_type_from_dtype(col_dtype) - ) - else: - raise ValueError( - "dtype should be a scalar/str/list-like/dict-like" - ) - options = ( - plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) - .compression(c_compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(byte_range[0]) - .byte_range_size(byte_range[1]) - .nrows(nrows if nrows is not None else -1) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(quoting) - .lineterminator(str(lineterminator)) - .quotechar(quotechar) - .decimal(decimal) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - options.set_header(header) - - if names is not None: - options.set_names([str(name) for name in names]) - - if prefix is not None: - options.set_prefix(prefix) - - if usecols is not None: - if all(isinstance(col, int) for col in usecols): - options.set_use_cols_indexes(list(usecols)) - else: - options.set_use_cols_names([str(name) for name in usecols]) - - if delimiter is not None: - options.set_delimiter(delimiter) - - if thousands is not None: - options.set_thousands(thousands) - - if comment is not None: - options.set_comment(comment) - - if parse_dates is not None: - options.set_parse_dates(list(parse_dates)) - - if hex_cols is not None: - options.set_parse_hex(list(hex_cols)) - - options.set_dtypes(new_dtypes) - - if true_values is not None: - options.set_true_values([str(val) for val in true_values]) - - if false_values is not None: - options.set_false_values([str(val) for val in false_values]) - - if na_values is not None: - options.set_na_values([str(val) for val in na_values]) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) - ) - - if dtype is not None: - if isinstance(dtype, abc.Mapping): - for k, v in dtype.items(): - if isinstance(cudf.dtype(v), cudf.CategoricalDtype): - df._data[str(k)] = df._data[str(k)].astype(v) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): - df = df.astype(dtype) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._column_names[index] - df._data[col_name] = df._data[col_name].astype(col_dtype) - - if names is not None and len(names) and isinstance(names[0], int): - df.columns = [int(x) for x in df._data] - elif names is None and header == -1 and cudf.get_option("mode.pandas_compatible"): - df.columns = [int(x) for x in df._column_names] - - # Set index if the index_col parameter is passed - if index_col is not None and index_col is not False: - if isinstance(index_col, int): - index_col_name = df._data.get_labels_by_index(index_col)[0] - df = df.set_index(index_col_name) - if isinstance(index_col_name, str) and \ - names is None and orig_header == "infer": - if index_col_name.startswith("Unnamed:"): - # TODO: Try to upstream it to libcudf - # csv reader in future - df._index.name = None - elif names is None: - df._index.name = index_col - else: - df = df.set_index(index_col) - - return df - - -@acquire_spill_lock() -def write_csv( - table, - object path_or_buf=None, - object sep=",", - object na_rep="", - bool header=True, - object lineterminator="\n", - int rows_per_chunk=8, - bool index=True, -): - """ - Cython function to call into libcudf API, see `write_csv`. - - See Also - -------- - cudf.to_csv - """ - index_and_not_empty = index is True and table.index is not None - columns = [ - col.to_pylibcudf(mode="read") for col in table.index._columns - ] if index_and_not_empty else [] - columns.extend(col.to_pylibcudf(mode="read") for col in table._columns) - col_names = [] - if header: - all_names = list(table.index.names) if index_and_not_empty else [] - all_names.extend( - na_rep if name is None or pd.isnull(name) - else name for name in table._column_names - ) - col_names = [ - '""' if (name in (None, '') and len(all_names) == 1) - else (str(name) if name not in (None, '') else '') - for name in all_names - ] - try: - plc.io.csv.write_csv( - ( - plc.io.csv.CsvWriterOptions.builder( - plc.io.SinkInfo([path_or_buf]), plc.Table(columns) - ) - .names(col_names) - .na_rep(na_rep) - .include_header(header) - .rows_per_chunk(rows_per_chunk) - .line_terminator(str(lineterminator)) - .inter_column_delimiter(str(sep)) - .true_value("True") - .false_value("False") - .build() - ) - ) - except OverflowError: - raise OverflowError( - f"Writing CSV file with chunksize={rows_per_chunk} failed. " - "Consider providing a smaller chunksize argument." - ) - - -cdef DataType _get_plc_data_type_from_dtype(object dtype) except *: - # TODO: Remove this work-around Dictionary types - # in libcudf are fully mapped to categorical columns: - # https://github.com/rapidsai/cudf/issues/3960 - if isinstance(dtype, cudf.CategoricalDtype): - dtype = dtype.categories.dtype - elif dtype == "category": - dtype = "str" - - if isinstance(dtype, str): - if str(dtype) == "date32": - return DataType( - libcudf_types.type_id.TIMESTAMP_DAYS - ) - elif str(dtype) in ("date", "date64"): - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[us]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MICROSECONDS - ) - elif str(dtype) == "timestamp[s]": - return DataType( - libcudf_types.type_id.TIMESTAMP_SECONDS - ) - elif str(dtype) == "timestamp[ms]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[ns]": - return DataType( - libcudf_types.type_id.TIMESTAMP_NANOSECONDS - ) - - dtype = cudf.dtype(dtype) - return dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt deleted file mode 100644 index e7408cf2852..00000000000 --- a/python/cudf/cudf/_lib/io/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources utils.pyx) -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX io_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/io/__init__.pxd b/python/cudf/cudf/_lib/io/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/__init__.py b/python/cudf/cudf/_lib/io/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd deleted file mode 100644 index 9b8bab012e2..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.vector cimport vector - -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - sink_info, - source_info, -) - -from cudf._lib.column cimport Column - - -cdef add_df_col_struct_names( - df, - child_names_dict -) -cdef update_col_struct_field_names( - Column col, - child_names -) -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info -) -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx deleted file mode 100644 index df4675be599..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - - -from libcpp.string cimport string - -from libcpp.vector cimport vector - -from pylibcudf.libcudf.io.types cimport column_name_info - -from cudf._lib.column cimport Column - -from cudf.core.dtypes import StructDtype - -cdef add_df_col_struct_names(df, child_names_dict): - for name, child_names in child_names_dict.items(): - col = df._data[name] - - df._data[name] = update_col_struct_field_names(col, child_names) - - -cdef update_col_struct_field_names(Column col, child_names): - if col.children: - children = list(col.children) - for i, (child, names) in enumerate(zip(children, child_names.values())): - children[i] = update_col_struct_field_names( - child, - names - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - col = col._rename_fields( - child_names.keys() - ) - - return col - - -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info -): - # Deprecated, remove in favor of add_col_struct_names - # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._column_labels_and_values): - table._data[name] = update_column_struct_field_names( - col, schema_info[i] - ) - - -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -): - cdef vector[string] field_names - - if col.children: - children = list(col.children) - for i, child in enumerate(children): - children[i] = update_column_struct_field_names( - child, - info.children[i] - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - field_names.reserve(len(col.base_children)) - for i in range(info.children.size()): - field_names.push_back(info.children[i].name) - col = col._rename_fields( - field_names - ) - - return col diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt deleted file mode 100644 index 22ec5d472f2..00000000000 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources - byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx -) -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/nvtext/__init__.pxd b/python/cudf/cudf/_lib/nvtext/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/nvtext/__init__.py b/python/cudf/cudf/_lib/nvtext/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx deleted file mode 100644 index 2b2762eead2..00000000000 --- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext -from pylibcudf.nvtext.byte_pair_encode import BPEMergePairs # no-cython-lint - - -@acquire_spill_lock() -def byte_pair_encoding( - Column strings, - object merge_pairs, - object separator -): - return Column.from_pylibcudf( - nvtext.byte_pair_encode.byte_pair_encoding( - strings.to_pylibcudf(mode="read"), - merge_pairs, - separator.device_value.c_value - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx deleted file mode 100644 index 3dd99c42d76..00000000000 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf cimport nvtext - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def edit_distance(Column strings, Column targets): - result = nvtext.edit_distance.edit_distance( - strings.to_pylibcudf(mode="read"), - targets.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def edit_distance_matrix(Column strings): - result = nvtext.edit_distance.edit_distance_matrix( - strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx deleted file mode 100644 index 7fdf9258b7f..00000000000 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def generate_ngrams(Column strings, int ngrams, object py_separator): - result = nvtext.generate_ngrams.generate_ngrams( - strings.to_pylibcudf(mode="read"), - ngrams, - py_separator.device_value.c_value - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def generate_character_ngrams(Column strings, int ngrams): - result = nvtext.generate_ngrams.generate_character_ngrams( - strings.to_pylibcudf(mode="read"), - ngrams - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def hash_character_ngrams(Column strings, int ngrams): - result = nvtext.generate_ngrams.hash_character_ngrams( - strings.to_pylibcudf(mode="read"), - ngrams - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx deleted file mode 100644 index c964d0206b7..00000000000 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def jaccard_index(Column input1, Column input2, int width): - result = nvtext.jaccard.jaccard_index( - input1.to_pylibcudf(mode="read"), - input2.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx deleted file mode 100644 index 9f2b3f92502..00000000000 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t, uint64_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def minhash(Column input, uint32_t seed, Column a, Column b, int width): - return Column.from_pylibcudf( - nvtext.minhash.minhash( - input.to_pylibcudf(mode="read"), - seed, - a.to_pylibcudf(mode="read"), - b.to_pylibcudf(mode="read"), - width, - ) - ) - - -@acquire_spill_lock() -def minhash64(Column input, uint64_t seed, Column a, Column b, int width): - return Column.from_pylibcudf( - nvtext.minhash.minhash64( - input.to_pylibcudf(mode="read"), - seed, - a.to_pylibcudf(mode="read"), - b.to_pylibcudf(mode="read"), - width, - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx deleted file mode 100644 index c125d92a24e..00000000000 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def ngrams_tokenize( - Column input, - int ngrams, - object py_delimiter, - object py_separator -): - return Column.from_pylibcudf( - nvtext.ngrams_tokenize.ngrams_tokenize( - input.to_pylibcudf(mode="read"), - ngrams, - py_delimiter.device_value.c_value, - py_separator.device_value.c_value - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx deleted file mode 100644 index cc45123dd0a..00000000000 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def normalize_spaces(Column input): - return Column.from_pylibcudf( - nvtext.normalize.normalize_spaces( - input.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def normalize_characters(Column input, bool do_lower=True): - return Column.from_pylibcudf( - nvtext.normalize.normalize_characters( - input.to_pylibcudf(mode="read"), - do_lower, - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx deleted file mode 100644 index bec56ade83c..00000000000 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from pylibcudf import nvtext - - -@acquire_spill_lock() -def replace_tokens(Column strings, - Column targets, - Column replacements, - object py_delimiter): - """ - The `targets` tokens are searched for within each `strings` - in the Column and replaced with the corresponding `replacements` - if found. Tokens are identified by the `py_delimiter` character - provided. - """ - - return Column.from_pylibcudf( - nvtext.replace.replace_tokens( - strings.to_pylibcudf(mode="read"), - targets.to_pylibcudf(mode="read"), - replacements.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - ) - ) - - -@acquire_spill_lock() -def filter_tokens(Column strings, - size_type min_token_length, - object py_replacement, - object py_delimiter): - """ - Tokens smaller than `min_token_length` are removed from `strings` - in the Column and optionally replaced with the corresponding - `py_replacement` string. Tokens are identified by the `py_delimiter` - character provided. - """ - - return Column.from_pylibcudf( - nvtext.replace.filter_tokens( - strings.to_pylibcudf(mode="read"), - min_token_length, - py_replacement.device_value.c_value, - py_delimiter.device_value.c_value, - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx deleted file mode 100644 index 63a389b64d5..00000000000 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from enum import IntEnum - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.nvtext.stemmer cimport ( - letter_type, - underlying_type_t_letter_type, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -class LetterType(IntEnum): - CONSONANT = letter_type.CONSONANT - VOWEL = letter_type.VOWEL - - -@acquire_spill_lock() -def porter_stemmer_measure(Column strings): - return Column.from_pylibcudf( - nvtext.stemmer.porter_stemmer_measure( - strings.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def is_letter(Column strings, - object ltype, - size_type index): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - index, - ) - ) - - -@acquire_spill_lock() -def is_letter_multi(Column strings, - object ltype, - Column indices): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - indices.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx deleted file mode 100644 index 5e0bfb74705..00000000000 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def subword_tokenize_inmem_hash( - Column strings, - object hashed_vocabulary, - uint32_t max_sequence_length=64, - uint32_t stride=48, - bool do_lower=True, - bool do_truncate=False, -): - """ - Subword tokenizes text series by using the pre-loaded hashed vocabulary - """ - result = nvtext.subword_tokenize.subword_tokenize( - strings.to_pylibcudf(mode="read"), - hashed_vocabulary, - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - # return the 3 tensor components - tokens = Column.from_pylibcudf(result[0]) - masks = Column.from_pylibcudf(result[1]) - metadata = Column.from_pylibcudf(result[2]) - return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx deleted file mode 100644 index f473c48e2f7..00000000000 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from pylibcudf.nvtext.tokenize import TokenizeVocabulary # no-cython-lint - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def _tokenize_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _tokenize_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def _count_tokens_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _count_tokens_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def character_tokenize(Column strings): - return Column.from_pylibcudf( - nvtext.tokenize.character_tokenize( - strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def detokenize(Column strings, Column indices, object py_separator): - return Column.from_pylibcudf( - nvtext.tokenize.detokenize( - strings.to_pylibcudf(mode="read"), - indices.to_pylibcudf(mode="read"), - py_separator.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def tokenize_with_vocabulary(Column strings, - object vocabulary, - object py_delimiter, - size_type default_id): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_with_vocabulary( - strings.to_pylibcudf(mode="read"), - vocabulary, - py_delimiter.device_value.c_value, - default_id - ) - ) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx deleted file mode 100644 index 00c434ae374..00000000000 --- a/python/cudf/cudf/_lib/parquet.pyx +++ /dev/null @@ -1,817 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import io - -import pyarrow as pa -import itertools -import cudf -from cudf.core.buffer import acquire_spill_lock - -try: - import ujson as json -except ImportError: - import json - -import numpy as np - -from cudf.api.types import is_list_like - -from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io - -from cudf._lib.utils import _index_level_name, generate_pandas_metadata - -from libc.stdint cimport int64_t -from libcpp cimport bool - -from pylibcudf.expressions cimport Expression -from pylibcudf.io.parquet cimport ChunkedParquetReader -from pylibcudf.libcudf.io.types cimport ( - statistics_freq, - compression_type, - dictionary_policy, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - add_df_col_struct_names, -) - -import pylibcudf as plc - -from pylibcudf cimport Table - -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT -from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata -from pylibcudf.io.parquet cimport ParquetChunkedWriter - - -def _parse_metadata(meta): - file_is_range_index = False - file_index_cols = None - file_column_dtype = None - - if 'index_columns' in meta and len(meta['index_columns']) > 0: - file_index_cols = meta['index_columns'] - - if isinstance(file_index_cols[0], dict) and \ - file_index_cols[0]['kind'] == 'range': - file_is_range_index = True - if 'column_indexes' in meta and len(meta['column_indexes']) == 1: - file_column_dtype = meta['column_indexes'][0]["numpy_type"] - return file_is_range_index, file_index_cols, file_column_dtype - - -cdef object _process_metadata(object df, - list names, - dict child_names, - list per_file_user_data, - object row_groups, - object filepaths_or_buffers, - bool allow_range_index, - bool use_pandas_metadata, - size_type nrows=-1, - int64_t skip_rows=0, - ): - - add_df_col_struct_names(df, child_names) - index_col = None - is_range_index = True - column_index_type = None - index_col_names = None - meta = None - for single_file in per_file_user_data: - if b'pandas' not in single_file: - continue - json_str = single_file[b'pandas'].decode('utf-8') - meta = json.loads(json_str) - file_is_range_index, index_col, column_index_type = _parse_metadata(meta) - is_range_index &= file_is_range_index - - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] - - if meta is not None: - # Book keep each column metadata as the order - # of `meta["columns"]` and `column_names` are not - # guaranteed to be deterministic and same always. - meta_data_per_column = { - col_meta['name']: col_meta for col_meta in meta["columns"] - } - - # update the decimal precision of each column - for col in names: - if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): - df._data[col].dtype.precision = ( - meta_data_per_column[col]["metadata"]["precision"] - ) - - # Set the index column - if index_col is not None and len(index_col) > 0: - if is_range_index: - if not allow_range_index: - return df - - if len(per_file_user_data) > 1: - range_index_meta = { - "kind": "range", - "name": None, - "start": 0, - "stop": len(df), - "step": 1 - } - else: - range_index_meta = index_col[0] - - if row_groups is not None: - per_file_metadata = [ - pa.parquet.read_metadata( - # Pyarrow cannot read directly from bytes - io.BytesIO(s) if isinstance(s, bytes) else s - ) for s in filepaths_or_buffers - ] - - filtered_idx = [] - for i, file_meta in enumerate(per_file_metadata): - row_groups_i = [] - start = 0 - for row_group in range(file_meta.num_row_groups): - stop = start + file_meta.row_group(row_group).num_rows - row_groups_i.append((start, stop)) - start = stop - - for rg in row_groups[i]: - filtered_idx.append( - cudf.RangeIndex( - start=row_groups_i[rg][0], - stop=row_groups_i[rg][1], - step=range_index_meta['step'] - ) - ) - - if len(filtered_idx) > 0: - idx = cudf.concat(filtered_idx) - else: - idx = cudf.Index._from_column(cudf.core.column.column_empty(0)) - else: - start = range_index_meta["start"] + skip_rows - stop = range_index_meta["stop"] - if nrows > -1: - stop = start + nrows - idx = cudf.RangeIndex( - start=start, - stop=stop, - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - - df._index = idx - elif set(index_col).issubset(names): - index_data = df[index_col] - actual_index_names = iter(index_col_names.values()) - if index_data._num_columns == 1: - idx = cudf.Index._from_column( - index_data._columns[0], - name=next(actual_index_names) - ) - else: - idx = cudf.MultiIndex.from_frame( - index_data, - names=list(actual_index_names) - ) - df.drop(columns=index_col, inplace=True) - df._index = idx - else: - if use_pandas_metadata: - df.index.names = index_col - - if df._num_columns == 0 and column_index_type is not None: - df._data.label_dtype = cudf.dtype(column_index_type) - - return df - - -def read_parquet_chunked( - filepaths_or_buffers, - columns=None, - row_groups=None, - use_pandas_metadata=True, - size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False -): - # Note: If this function ever takes accepts filters - # allow_range_index needs to be False when a filter is passed - # (see read_parquet) - allow_range_index = columns is not None and len(columns) != 0 - - options = ( - plc.io.parquet.ParquetReaderOptions.builder( - plc.io.SourceInfo(filepaths_or_buffers) - ) - .use_pandas_metadata(use_pandas_metadata) - .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) - .build() - ) - if row_groups is not None: - options.set_row_groups(row_groups) - if nrows > -1: - options.set_num_rows(nrows) - if skip_rows != 0: - options.set_skip_rows(skip_rows) - if columns is not None: - options.set_columns(columns) - - reader = ChunkedParquetReader( - options, - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - ) - - tbl_w_meta = reader.read_chunk() - column_names = tbl_w_meta.column_names(include_children=False) - child_names = tbl_w_meta.child_names - per_file_user_data = tbl_w_meta.per_file_user_data - concatenated_columns = tbl_w_meta.tbl.columns() - - # save memory - del tbl_w_meta - - cdef Table tbl - while reader.has_next(): - tbl = reader.read_chunk().tbl - - for i in range(tbl.num_columns()): - concatenated_columns[i] = plc.concatenate.concatenate( - [concatenated_columns[i], tbl._columns[i]] - ) - # Drop residual columns to save memory - tbl._columns[i] = None - - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in concatenated_columns], - column_names=column_names, - index_names=None - ) - ) - df = _process_metadata(df, column_names, child_names, - per_file_user_data, row_groups, - filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - - -cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, - use_pandas_metadata=True, - Expression filters=None, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False): - """ - Cython function to call into libcudf API, see `read_parquet`. - - filters, if not None, should be an Expression that evaluates to a - boolean predicate as a function of columns being read. - - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - - allow_range_index = True - if columns is not None and len(columns) == 0 or filters: - allow_range_index = False - - options = ( - plc.io.parquet.ParquetReaderOptions.builder( - plc.io.SourceInfo(filepaths_or_buffers) - ) - .use_pandas_metadata(use_pandas_metadata) - .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) - .build() - ) - if row_groups is not None: - options.set_row_groups(row_groups) - if nrows > -1: - options.set_num_rows(nrows) - if skip_rows != 0: - options.set_skip_rows(skip_rows) - if columns is not None: - options.set_columns(columns) - if filters is not None: - options.set_filter(filters) - - tbl_w_meta = plc.io.parquet.read_parquet(options) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(tbl_w_meta) - ) - - df = _process_metadata(df, tbl_w_meta.column_names(include_children=False), - tbl_w_meta.child_names, tbl_w_meta.per_file_user_data, - row_groups, filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - -cpdef read_parquet_metadata(list filepaths_or_buffers): - """ - Cython function to call into libcudf API, see `read_parquet_metadata`. - - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( - plc.io.SourceInfo(filepaths_or_buffers) - ) - - # read all column names including index column, if any - col_names = [info.name() for info in parquet_metadata.schema().root().children()] - - index_col_names = set() - json_str = parquet_metadata.metadata()['pandas'] - if json_str != "": - meta = json.loads(json_str) - file_is_range_index, index_col, _ = _parse_metadata(meta) - if ( - not file_is_range_index - and index_col is not None - ): - columns = meta['columns'] - for idx_col in index_col: - for c in columns: - if c['field_name'] == idx_col: - index_col_names.add(idx_col) - - # remove the index column from the list of column names - # only if index_col_names is not None - if len(index_col_names) >= 0: - col_names = [name for name in col_names if name not in index_col_names] - - return ( - parquet_metadata.num_rows(), - parquet_metadata.num_rowgroups(), - col_names, - len(col_names), - parquet_metadata.rowgroup_metadata() - ) - - -@acquire_spill_lock() -def write_parquet( - table, - object filepaths_or_buffers, - object index=None, - object compression="snappy", - object statistics="ROWGROUP", - object metadata_file_path=None, - object int96_timestamps=False, - object row_group_size_bytes=None, - object row_group_size_rows=None, - object max_page_size_bytes=None, - object max_page_size_rows=None, - object max_dictionary_size=None, - object partitions_info=None, - object force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, - write_arrow_schema=False, -): - """ - Cython function to call into libcudf API, see `write_parquet`. - - See Also - -------- - cudf.io.parquet.write_parquet - """ - if index is True or ( - index is None and not isinstance(table._index, cudf.RangeIndex) - ): - columns = [*table.index._columns, *table._columns] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - _index_level_name(idx_name, level, table._column_names) - ) - num_index_cols_meta = len(table._index.names) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - tbl_meta = TableInputMetadata(plc_table) - num_index_cols_meta = 0 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - if not isinstance(name, str): - if cudf.get_option("mode.pandas_compatible"): - tbl_meta.column_metadata[i].set_name(str(name)) - else: - raise ValueError( - "Writing a Parquet file requires string column names" - ) - else: - tbl_meta.column_metadata[i].set_name(name) - - _set_col_metadata( - table[name]._column, - tbl_meta.column_metadata[i], - force_nullable_schema, - None, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - if partitions_info is not None: - user_data = [ - {"pandas": generate_pandas_metadata( - table.iloc[start_row:start_row + num_row].copy(deep=False), - index - )} - for start_row, num_row in partitions_info - ] - else: - user_data = [{"pandas": generate_pandas_metadata(table, index)}] - - if header_version not in ("1.0", "2.0"): - raise ValueError( - f"Invalid parquet header version: {header_version}. " - "Valid values are '1.0' and '2.0'" - ) - - dict_policy = ( - plc.io.types.DictionaryPolicy.ADAPTIVE - if use_dictionary - else plc.io.types.DictionaryPolicy.NEVER - ) - - comp_type = _get_comp_type(compression) - stat_freq = _get_stat_freq(statistics) - options = ( - plc.io.parquet.ParquetWriterOptions.builder( - plc.io.SinkInfo(filepaths_or_buffers), plc_table - ) - .metadata(tbl_meta) - .key_value_metadata(user_data) - .compression(comp_type) - .stats_level(stat_freq) - .int96_timestamps(int96_timestamps) - .write_v2_headers(header_version == "2.0") - .dictionary_policy(dict_policy) - .utc_timestamps(False) - .write_arrow_schema(write_arrow_schema) - .build() - ) - if partitions_info is not None: - options.set_partitions( - [plc.io.types.PartitionInfo(part[0], part[1]) for part in partitions_info] - ) - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - options.set_column_chunks_file_paths(metadata_file_path) - else: - options.set_column_chunks_file_paths([metadata_file_path]) - if row_group_size_bytes is not None: - options.set_row_group_size_bytes(row_group_size_bytes) - if row_group_size_rows is not None: - options.set_row_group_size_rows(row_group_size_rows) - if max_page_size_bytes is not None: - options.set_max_page_size_bytes(max_page_size_bytes) - if max_page_size_rows is not None: - options.set_max_page_size_rows(max_page_size_rows) - if max_dictionary_size is not None: - options.set_max_dictionary_size(max_dictionary_size) - blob = plc.io.parquet.write_parquet(options) - if metadata_file_path is not None: - return np.asarray(blob.obj) - else: - return None - - -cdef class ParquetWriter: - """ - ParquetWriter lets you incrementally write out a Parquet file from a series - of cudf tables - - Parameters - ---------- - filepath_or_buffer : str, io.IOBase, os.PathLike, or list - File path or buffer to write to. The argument may also correspond - to a list of file paths or buffers. - index : bool or None, default None - If ``True``, include a dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. If ``None``, - index(es) other than RangeIndex will be saved as columns. - compression : {'snappy', None}, default 'snappy' - Name of the compression to use. Use ``None`` for no compression. - statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' - Level at which column statistics should be included in file. - row_group_size_bytes: int, default ``uint64 max`` - Maximum size of each stripe of the output. - By default, a virtually infinite size equal to ``uint64 max`` will be used. - row_group_size_rows: int, default 1000000 - Maximum number of rows of each stripe of the output. - By default, 1000000 (10^6 rows) will be used. - max_page_size_bytes: int, default 524288 - Maximum uncompressed size of each page of the output. - By default, 524288 (512KB) will be used. - max_page_size_rows: int, default 20000 - Maximum number of rows of each page of the output. - By default, 20000 will be used. - max_dictionary_size: int, default 1048576 - Maximum size of the dictionary page for each output column chunk. Dictionary - encoding for column chunks that exceeds this limit will be disabled. - By default, 1048576 (1MB) will be used. - use_dictionary : bool, default True - If ``True``, enable dictionary encoding for Parquet page data - subject to ``max_dictionary_size`` constraints. - If ``False``, disable dictionary encoding for Parquet page data. - store_schema : bool, default False - If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section for faithful round-tripping. - See Also - -------- - cudf.io.parquet.write_parquet - """ - cdef bool initialized - cdef ParquetChunkedWriter writer - cdef SinkInfo sink - cdef TableInputMetadata tbl_meta - cdef str statistics - cdef object compression - cdef object index - cdef size_t row_group_size_bytes - cdef size_type row_group_size_rows - cdef size_t max_page_size_bytes - cdef size_type max_page_size_rows - cdef size_t max_dictionary_size - cdef bool use_dictionary - cdef bool write_arrow_schema - - def __cinit__(self, object filepath_or_buffer, object index=None, - object compression="snappy", str statistics="ROWGROUP", - size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - size_type row_group_size_rows=1000000, - size_t max_page_size_bytes=524288, - size_type max_page_size_rows=20000, - size_t max_dictionary_size=1048576, - bool use_dictionary=True, - bool store_schema=False): - filepaths_or_buffers = ( - list(filepath_or_buffer) - if is_list_like(filepath_or_buffer) - else [filepath_or_buffer] - ) - self.sink = plc.io.SinkInfo(filepaths_or_buffers) - self.statistics = statistics - self.compression = compression - self.index = index - self.initialized = False - self.row_group_size_bytes = row_group_size_bytes - self.row_group_size_rows = row_group_size_rows - self.max_page_size_bytes = max_page_size_bytes - self.max_page_size_rows = max_page_size_rows - self.max_dictionary_size = max_dictionary_size - self.use_dictionary = use_dictionary - self.write_arrow_schema = store_schema - - def write_table(self, table, object partitions_info=None): - """ Writes a single table to the file """ - if not self.initialized: - self._initialize_chunked_state( - table, - num_partitions=len(partitions_info) if partitions_info else 1 - ) - if self.index is not False and ( - table._index.name is not None or - isinstance(table._index, cudf.core.multiindex.MultiIndex)): - columns = [*table.index._columns, *table._columns] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - self.writer.write(plc_table, partitions_info) - - def close(self, object metadata_file_path=None): - if not self.initialized: - return None - column_chunks_file_paths=[] - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - column_chunks_file_paths = list(metadata_file_path) - else: - column_chunks_file_paths = [metadata_file_path] - blob = self.writer.close(column_chunks_file_paths) - if metadata_file_path is not None: - return np.asarray(blob.obj) - return None - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def _initialize_chunked_state(self, table, num_partitions=1): - """ Prepares all the values required to build the - chunked_parquet_writer_options and creates a writer""" - - # Set the table_metadata - num_index_cols_meta = 0 - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in table._columns - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name(idx_name) - num_index_cols_meta = len(table._index.names) - else: - if table._index.name is not None: - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain( - table.index._columns, table._columns - ) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - self.tbl_meta.column_metadata[0].set_name(table._index.name) - num_index_cols_meta = 1 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name) - _set_col_metadata( - table[name]._column, - self.tbl_meta.column_metadata[i], - ) - - index = ( - False if isinstance(table._index, cudf.RangeIndex) else self.index - ) - user_data = [{"pandas" : generate_pandas_metadata(table, index)}]*num_partitions - cdef compression_type comp_type = _get_comp_type(self.compression) - cdef statistics_freq stat_freq = _get_stat_freq(self.statistics) - cdef dictionary_policy dict_policy = ( - plc.io.types.DictionaryPolicy.ADAPTIVE - if self.use_dictionary - else plc.io.types.DictionaryPolicy.NEVER - ) - options = ( - plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(user_data) - .compression(comp_type) - .stats_level(stat_freq) - .row_group_size_bytes(self.row_group_size_bytes) - .row_group_size_rows(self.row_group_size_rows) - .max_page_size_bytes(self.max_page_size_bytes) - .max_page_size_rows(self.max_page_size_rows) - .max_dictionary_size(self.max_dictionary_size) - .write_arrow_schema(self.write_arrow_schema) - .build() - ) - options.set_dictionary_policy(dict_policy) - self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) - self.initialized = True - - -cpdef merge_filemetadata(object filemetadata_list): - """ - Cython function to call into libcudf API, see `merge_row_group_metadata`. - - See Also - -------- - cudf.io.parquet.merge_row_group_metadata - """ - return np.asarray( - plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj - ) - - -cdef statistics_freq _get_stat_freq(str statistics): - result = getattr( - plc.io.types.StatisticsFreq, - f"STATISTICS_{statistics.upper()}", - None - ) - if result is None: - raise ValueError("Unsupported `statistics_freq` type") - return result - - -cdef compression_type _get_comp_type(object compression): - if compression is None: - return plc.io.types.CompressionType.NONE - result = getattr( - plc.io.types.CompressionType, - str(compression).upper(), - None - ) - if result is None: - raise ValueError("Unsupported `compression` type") - return result - - -cdef _set_col_metadata( - Column col, - ColumnInMetadata col_meta, - bool force_nullable_schema=False, - str path=None, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, -): - need_path = (skip_compression is not None or column_encoding is not None or - column_type_length is not None or output_as_binary is not None) - name = col_meta.get_name() if need_path else None - full_path = path + "." + name if path is not None else name - - if force_nullable_schema: - # Only set nullability if `force_nullable_schema` - # is true. - col_meta.set_nullability(True) - - if skip_compression is not None and full_path in skip_compression: - col_meta.set_skip_compression(True) - - if column_encoding is not None and full_path in column_encoding: - encoding = column_encoding[full_path] - if encoding is None: - c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT - else: - enc = str(encoding).upper() - c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) - if c_encoding is None: - raise ValueError("Unsupported `column_encoding` type") - col_meta.set_encoding(c_encoding) - - if column_type_length is not None and full_path in column_type_length: - col_meta.set_output_as_binary(True) - col_meta.set_type_length(column_type_length[full_path]) - - if output_as_binary is not None and full_path in output_as_binary: - col_meta.set_output_as_binary(True) - - if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - col_meta.child(i).set_name(name) - _set_col_metadata( - child_col, - col_meta.child(i), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.ListDtype): - if full_path is not None: - full_path = full_path + ".list" - col_meta.child(1).set_name("element") - _set_col_metadata( - col.children[1], - col_meta.child(1), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): - col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx deleted file mode 100644 index 2850cab93a1..00000000000 --- a/python/cudf/cudf/_lib/reduce.pyx +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -import warnings - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id - -import pylibcudf - -from cudf.core._internals.aggregation import make_aggregation - - -@acquire_spill_lock() -def reduce(reduction_op, Column incol, dtype=None, **kwargs): - """ - Top level Cython reduce function wrapping libcudf reductions. - - Parameters - ---------- - reduction_op : string - A string specifying the operation, e.g. sum, prod - incol : Column - A cuDF Column object - dtype: numpy.dtype, optional - A numpy data type to use for the output, defaults - to the same type as the input column - """ - if dtype is not None: - warnings.warn( - "dtype is deprecated and will be remove in a future release. " - "Cast the result (e.g. .astype) after the operation instead.", - FutureWarning - ) - col_dtype = dtype - else: - col_dtype = incol._reduction_result_dtype(reduction_op) - - # check empty case - if len(incol) <= incol.null_count: - if reduction_op == 'sum' or reduction_op == 'sum_of_squares': - return incol.dtype.type(0) - if reduction_op == 'product': - return incol.dtype.type(1) - if reduction_op == "any": - return False - - return cudf.utils.dtypes._get_nan_for_dtype(col_dtype) - - result = pylibcudf.reduce.reduce( - incol.to_pylibcudf(mode="read"), - make_aggregation(reduction_op, kwargs).c_obj, - dtype_to_pylibcudf_type(col_dtype), - ) - - if is_decimal_type_id(result.type().id()): - scale = -result.type().scale() - precision = _reduce_precision(col_dtype, reduction_op, len(incol)) - return DeviceScalar.from_pylibcudf( - result, - dtype=col_dtype.__class__(precision, scale), - ).value - scalar = DeviceScalar.from_pylibcudf(result).value - if isinstance(col_dtype, cudf.StructDtype): - # TODO: Utilize column_metadata in libcudf to maintain field labels - return dict(zip(col_dtype.fields.keys(), scalar.values())) - return scalar - - -@acquire_spill_lock() -def scan(scan_op, Column incol, inclusive, **kwargs): - """ - Top level Cython scan function wrapping libcudf scans. - - Parameters - ---------- - incol : Column - A cuDF Column object - scan_op : string - A string specifying the operation, e.g. cumprod - inclusive: bool - Flag for including nulls in relevant scan - """ - return Column.from_pylibcudf( - pylibcudf.reduce.scan( - incol.to_pylibcudf(mode="read"), - make_aggregation(scan_op, kwargs).c_obj, - pylibcudf.reduce.ScanType.INCLUSIVE if inclusive - else pylibcudf.reduce.ScanType.EXCLUSIVE, - ) - ) - - -@acquire_spill_lock() -def minmax(Column incol): - """ - Top level Cython minmax function wrapping libcudf minmax. - - Parameters - ---------- - incol : Column - A cuDF Column object - - Returns - ------- - A pair of ``(min, max)`` values of ``incol`` - """ - min, max = pylibcudf.reduce.minmax(incol.to_pylibcudf(mode="read")) - return ( - cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(min)), - cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(max)), - ) - - -def _reduce_precision(dtype, op, nrows): - """ - Returns the result precision when performing the reduce - operation `op` for the given dtype and column size. - - See: https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - """ # noqa: E501 - p = dtype.precision - if op in ("min", "max"): - new_p = p - elif op == "sum": - new_p = p + nrows - 1 - elif op == "product": - new_p = p * nrows + nrows - 1 - elif op == "sum_of_squares": - new_p = 2 * p + nrows - else: - raise NotImplementedError() - return max(min(new_p, dtype.MAX_PRECISION), 0) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx deleted file mode 100644 index eefe37d9880..00000000000 --- a/python/cudf/cudf/_lib/sort.pyx +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from itertools import repeat - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from pylibcudf.libcudf.aggregation cimport rank_method -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -@acquire_spill_lock() -def is_sorted( - list source_columns, object ascending=None, object null_position=None -): - """ - Checks whether the rows of a `table` are sorted in lexicographical order. - - Parameters - ---------- - source_columns : list of columns - columns to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order of - each column. If list-like, size of list-like must be len(columns). If - None, all columns expected sort order is set to ascending. False (0) - - descending, True (1) - ascending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of nulls - compared to other elements. If list-like, size of list-like must be - len(columns). If None, null order is set to before. False (0) - after, - True (1) - before. - - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. - """ - - if ascending is None: - column_order = [pylibcudf.types.Order.ASCENDING] * len(source_columns) - else: - if len(ascending) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(ascending)} for `ascending`" - ) - column_order = [pylibcudf.types.Order.DESCENDING] * len(source_columns) - for idx, val in enumerate(ascending): - if val: - column_order[idx] = pylibcudf.types.Order.ASCENDING - - if null_position is None: - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - else: - if len(null_position) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(null_position)} for `null_position`" - ) - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - for idx, val in enumerate(null_position): - if val: - null_precedence[idx] = pylibcudf.types.NullOrder.BEFORE - - return pylibcudf.sorting.is_sorted( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - column_order, - null_precedence - ) - - -def ordering(column_order, null_precedence): - """ - Construct order and null order vectors - - Parameters - ---------- - column_order - Iterable of bool (True for ascending order, False for descending) - null_precedence - Iterable string for null positions ("first" for start, "last" for end) - - Both iterables must be the same length (not checked) - - Returns - ------- - pair of vectors (order, and null_order) - """ - c_column_order = [] - c_null_precedence = [] - for asc, null in zip(column_order, null_precedence): - c_column_order.append( - pylibcudf.types.Order.ASCENDING if asc else pylibcudf.types.Order.DESCENDING - ) - if asc ^ (null == "first"): - c_null_precedence.append(pylibcudf.types.NullOrder.AFTER) - elif asc ^ (null == "last"): - c_null_precedence.append(pylibcudf.types.NullOrder.BEFORE) - else: - raise ValueError(f"Invalid null precedence {null}") - return c_column_order, c_null_precedence - - -@acquire_spill_lock() -def order_by( - list columns_from_table, - object ascending, - str na_position, - *, - bool stable -): - """ - Get index to sort the table in ascending/descending order. - - Parameters - ---------- - columns_from_table : list[Column] - Columns from the table which will be sorted - ascending : sequence[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : str - Whether null values should show up at the "first" or "last" - position of **all** sorted column. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - Column of indices that sorts the table - """ - order = ordering(ascending, repeat(na_position)) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sorted_order") - - return Column.from_pylibcudf( - func( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in columns_from_table], - ), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort( - list values, - list column_order=None, - list null_precedence=None, -): - """ - Sort the table in ascending/descending order. - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. - null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - return columns_from_pylibcudf_table( - pylibcudf.sorting.sort( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort_by_key( - list values, - list keys, - object ascending, - object na_position, - *, - bool stable, -): - """ - Sort a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - ascending : list[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : list[str] - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - order = ordering(ascending, na_position) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sort_by_key") - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def segmented_sort_by_key( - list values, - list keys, - Column segment_offsets, - list column_order=None, - list null_precedence=None, - *, - bool stable, -): - """ - Sort segments of a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - offsets : Column - Segment offsets - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. - null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - func = getattr( - pylibcudf.sorting, - f"{'stable_' if stable else ''}segmented_sort_by_key" - ) - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - segment_offsets.to_pylibcudf(mode="read"), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def digitize(list source_columns, list bins, bool right=False): - """ - Return the indices of the bins to which each value in source_table belongs. - - Parameters - ---------- - source_columns : Input columns to be binned. - bins : List containing columns of bins - right : Indicating whether the intervals include the - right or the left bin edge. - """ - return Column.from_pylibcudf( - getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in bins] - ), - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - [pylibcudf.types.Order.ASCENDING]*len(bins), - [pylibcudf.types.NullOrder.BEFORE]*len(bins) - ) - ) - - -@acquire_spill_lock() -def rank_columns(list source_columns, rank_method method, str na_option, - bool ascending, bool pct - ): - """ - Compute numerical data ranks (1 through n) of each column in the dataframe - """ - column_order = ( - pylibcudf.types.Order.ASCENDING - if ascending - else pylibcudf.types.Order.DESCENDING - ) - # ascending - # #top = na_is_smallest - # #bottom = na_is_largest - # #keep = na_is_largest - # descending - # #top = na_is_largest - # #bottom = na_is_smallest - # #keep = na_is_smallest - if ascending: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.BEFORE - else: - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - null_precedence = pylibcudf.types.NullOrder.BEFORE - c_null_handling = ( - pylibcudf.types.NullPolicy.EXCLUDE - if na_option == 'keep' - else pylibcudf.types.NullPolicy.INCLUDE - ) - - return [ - Column.from_pylibcudf( - pylibcudf.sorting.rank( - col.to_pylibcudf(mode="read"), - method, - column_order, - c_null_handling, - null_precedence, - pct, - ) - ) - for col in source_columns - ] diff --git a/python/cudf/cudf/_lib/strings/__init__.pxd b/python/cudf/cudf/_lib/strings/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py deleted file mode 100644 index b9095a22a42..00000000000 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams, - generate_ngrams, - hash_character_ngrams, -) -from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import ( - minhash, - minhash64, -) -from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize -from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces -from cudf._lib.nvtext.replace import filter_tokens, replace_tokens -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter, - is_letter_multi, - porter_stemmer_measure, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column, - _count_tokens_scalar, - _tokenize_column, - _tokenize_scalar, - character_tokenize, - detokenize, - tokenize_with_vocabulary, -) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2df154ee112..1b6152b81ca 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2,7 +2,6 @@ from __future__ import annotations -import pickle import warnings from functools import cached_property from typing import TYPE_CHECKING, Any, Literal @@ -330,13 +329,6 @@ def get_level_values(self, level): else: raise KeyError(f"Requested level with name {level} " "not found") - @classmethod - def deserialize(cls, header, frames): - # Dispatch deserialization to the appropriate index type in case - # deserialization is ever attempted with the base class directly. - idx_type = pickle.loads(header["type-serialized"]) - return idx_type.deserialize(header, frames) - @property def names(self): """ diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py new file mode 100644 index 00000000000..69f9e7664b1 --- /dev/null +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -0,0 +1,205 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +import itertools +from typing import TYPE_CHECKING, Literal + +import pylibcudf as plc + +from cudf._lib.column import Column +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from collections.abc import Iterable + + from cudf.core.column import ColumnBase + + +@acquire_spill_lock() +def is_sorted( + source_columns: list[ColumnBase], + ascending: list[bool] | None = None, + null_position: list[bool] | None = None, +) -> bool: + """ + Checks whether the rows of a `table` are sorted in lexicographical order. + + Parameters + ---------- + source_columns : list of columns + columns to be checked for sort order + ascending : None or list-like of booleans + None or list-like of boolean values indicating expected sort order of + each column. If list-like, size of list-like must be len(columns). If + None, all columns expected sort order is set to ascending. False (0) - + descending, True (1) - ascending. + null_position : None or list-like of booleans + None or list-like of boolean values indicating desired order of nulls + compared to other elements. If list-like, size of list-like must be + len(columns). If None, null order is set to before. False (0) - after, + True (1) - before. + + Returns + ------- + returns : boolean + Returns True, if sorted as expected by ``ascending`` and + ``null_position``, False otherwise. + """ + if ascending is None: + column_order = [plc.types.Order.ASCENDING] * len(source_columns) + else: + if len(ascending) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(ascending)} for `ascending`" + ) + column_order = [ + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + for asc in ascending + ] + + if null_position is None: + null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns) + else: + if len(null_position) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(null_position)} for `null_position`" + ) + null_precedence = [ + plc.types.NullOrder.BEFORE if null else plc.types.NullOrder.AFTER + for null in null_position + ] + + return plc.sorting.is_sorted( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + column_order, + null_precedence, + ) + + +def ordering( + column_order: list[bool], + null_precedence: Iterable[Literal["first", "last"]], +) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: + """ + Construct order and null order vectors + + Parameters + ---------- + column_order + Iterable of bool (True for ascending order, False for descending) + null_precedence + Iterable string for null positions ("first" for start, "last" for end) + + Both iterables must be the same length (not checked) + + Returns + ------- + pair of vectors (order, and null_order) + """ + c_column_order = [] + c_null_precedence = [] + for asc, null in zip(column_order, null_precedence): + c_column_order.append( + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + ) + if asc ^ (null == "first"): + c_null_precedence.append(plc.types.NullOrder.AFTER) + elif asc ^ (null == "last"): + c_null_precedence.append(plc.types.NullOrder.BEFORE) + else: + raise ValueError(f"Invalid null precedence {null}") + return c_column_order, c_null_precedence + + +@acquire_spill_lock() +def order_by( + columns_from_table: list[ColumnBase], + ascending: list[bool], + na_position: Literal["first", "last"], + *, + stable: bool, +): + """ + Get index to sort the table in ascending/descending order. + + Parameters + ---------- + columns_from_table : list[Column] + Columns from the table which will be sorted + ascending : sequence[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : str + Whether null values should show up at the "first" or "last" + position of **all** sorted column. + stable : bool + Should the sort be stable? (no default) + + Returns + ------- + Column of indices that sorts the table + """ + order = ordering(ascending, itertools.repeat(na_position)) + func = ( + plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order + ) + return Column.from_pylibcudf( + func( + plc.Table( + [col.to_pylibcudf(mode="read") for col in columns_from_table], + ), + order[0], + order[1], + ) + ) + + +@acquire_spill_lock() +def sort_by_key( + values: list[ColumnBase], + keys: list[ColumnBase], + ascending: list[bool], + na_position: list[Literal["first", "last"]], + *, + stable: bool, +) -> list[ColumnBase]: + """ + Sort a table by given keys + + Parameters + ---------- + values : list[Column] + Columns of the table which will be sorted + keys : list[Column] + Columns making up the sort key + ascending : list[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : list[str] + Sequence of "first" or "last" values (default "first") + indicating the position of null values when sorting the keys. + stable : bool + Should the sort be stable? (no default) + + Returns + ------- + list[Column] + list of value columns sorted by keys + """ + order = ordering(ascending, na_position) + func = ( + plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + ) + return [ + Column.from_pylibcudf(col) + for col in func( + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + plc.Table([col.to_pylibcudf(mode="read") for col in keys]), + order[0], + order[1], + ).columns() + ] diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index ce6bb83bc77..c8ea03b04fe 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,8 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. """Common abstract base classes for cudf.""" -import pickle - import numpy import cudf @@ -22,6 +20,14 @@ class Serializable: latter converts back from that representation into an equivalent object. """ + # A mapping from class names to the classes themselves. This is used to + # reconstruct the correct class when deserializing an object. + _name_type_map: dict = {} + + def __init_subclass__(cls, /, **kwargs): + super().__init_subclass__(**kwargs) + cls._name_type_map[cls.__name__] = cls + def serialize(self): """Generate an equivalent serializable representation of an object. @@ -98,7 +104,7 @@ def device_serialize(self): ) for f in frames ) - header["type-serialized"] = pickle.dumps(type(self)) + header["type-serialized-name"] = type(self).__name__ header["is-cuda"] = [ hasattr(f, "__cuda_array_interface__") for f in frames ] @@ -128,10 +134,10 @@ def device_deserialize(cls, header, frames): :meta private: """ - typ = pickle.loads(header["type-serialized"]) + typ = cls._name_type_map[header["type-serialized-name"]] frames = [ cudf.core.buffer.as_buffer(f) if c else memoryview(f) - for c, f in zip(header["is-cuda"], frames) + for c, f in zip(header["is-cuda"], frames, strict=True) ] return typ.deserialize(header, frames) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index ffa306bf93f..625938ca168 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -3,7 +3,6 @@ from __future__ import annotations import math -import pickle import weakref from types import SimpleNamespace from typing import TYPE_CHECKING, Any, Literal @@ -432,8 +431,7 @@ def serialize(self) -> tuple[dict, list]: second element is a list containing single frame. """ header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["owner-type-serialized-name"] = type(self._owner).__name__ header["frame_count"] = 1 frames = [self] return header, frames @@ -460,7 +458,9 @@ def deserialize(cls, header: dict, frames: list) -> Self: if isinstance(frame, cls): return frame # The frame is already deserialized - owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"]) + owner_type: BufferOwner = Serializable._name_type_map[ + header["owner-type-serialized-name"] + ] if hasattr(frame, "__cuda_array_interface__"): owner = owner_type.from_device_memory(frame, exposed=False) else: diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 7305ff651c6..cbb65229933 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -3,7 +3,6 @@ from __future__ import annotations import collections.abc -import pickle import time import weakref from threading import RLock @@ -415,8 +414,7 @@ def serialize(self) -> tuple[dict, list]: header: dict[str, Any] = {} frames: list[Buffer | memoryview] with self._owner.lock: - header["type-serialized"] = pickle.dumps(self.__class__) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["owner-type-serialized-name"] = type(self._owner).__name__ header["frame_count"] = 1 if self.is_spilled: frames = [self.memoryview()] diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 8d38a5f2272..b49f5154697 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.byte_pair_encode import ( - byte_pair_encoding as cpp_byte_pair_encoding, -) class BytePairEncoder: @@ -25,12 +22,12 @@ class BytePairEncoder: BytePairEncoder """ - def __init__(self, merges_pair: "cudf.Series"): + def __init__(self, merges_pair: cudf.Series) -> None: self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs( merges_pair._column.to_pylibcudf(mode="read") ) - def __call__(self, text, separator: str = " ") -> cudf.Series: + def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: """ Parameters @@ -57,6 +54,6 @@ def __call__(self, text, separator: str = " ") -> cudf.Series: dtype: object """ sep = cudf.Scalar(separator, dtype="str") - result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - - return cudf.Series._from_column(result) + return cudf.Series._from_column( + text._column.byte_pair_encoding(self.merge_pairs, sep) + ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 71ec11e75af..a0cf38c6f51 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1193,7 +1193,7 @@ def _concat( f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: - codes_col = column.column_empty(0, head.codes.dtype, masked=True) + codes_col = column.column_empty(0, head.codes.dtype) else: codes_col = column.concat_columns(codes) # type: ignore[arg-type] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b317858077f..cc07af0f669 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,7 +2,7 @@ from __future__ import annotations -import pickle +import warnings from collections import abc from collections.abc import MutableSequence, Sequence from functools import cached_property @@ -32,7 +32,7 @@ drop_duplicates, drop_nulls, ) -from cudf._lib.types import size_type_dtype +from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -42,7 +42,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import unary +from cudf.core._internals import aggregation, sorting, unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -260,21 +260,17 @@ def all(self, skipna: bool = True) -> bool: # The skipna argument is only used for numerical columns. # If all entries are null the result is True, including when the column # is empty. - if self.null_count == self.size: return True - - return libcudf.reduce.reduce("all", self) + return self.reduce("all") def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. - if not skipna and self.has_nulls(): return True elif skipna and self.null_count == self.size: return False - - return libcudf.reduce.reduce("any", self) + return self.reduce("any") def dropna(self) -> Self: if self.has_nulls(): @@ -555,7 +551,7 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self: if stop < 0 and not (stride < 0 and stop == -1): stop = stop + len(self) if (stride > 0 and start >= stop) or (stride < 0 and start <= stop): - return cast(Self, column_empty(0, self.dtype, masked=True)) + return cast(Self, column_empty(0, self.dtype)) # compute mask slice if stride == 1: return libcudf.copying.column_slice(self, [start, stop])[ @@ -1000,13 +996,13 @@ def is_unique(self) -> bool: @cached_property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [True], None ) @cached_property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [False], None ) @@ -1030,15 +1026,20 @@ def contains(self, other: ColumnBase) -> ColumnBase: def sort_values( self: Self, ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ) -> Self: if (not ascending and self.is_monotonic_decreasing) or ( ascending and self.is_monotonic_increasing ): return self.copy() - return libcudf.sort.sort( - [self], column_order=[ascending], null_precedence=[na_position] - )[0] + order = sorting.ordering([ascending], [na_position]) + with acquire_spill_lock(): + plc_table = plc.sorting.sort( + plc.Table([self.to_pylibcudf(mode="read")]), + order[0], + order[1], + ) + return type(self).from_pylibcudf(plc_table.columns()[0]) # type: ignore[return-value] def distinct_count(self, dropna: bool = True) -> int: try: @@ -1058,7 +1059,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: if self.dtype == dtype: result = self else: - result = column_empty(0, dtype=dtype, masked=self.nullable) + result = column_empty(0, dtype=dtype) elif dtype == "category": # TODO: Figure out why `cudf.dtype("category")` # astype's different than just the string @@ -1208,7 +1209,7 @@ def argsort( as_column(range(len(self) - 1, -1, -1)), ) else: - return libcudf.sort.order_by( + return sorting.order_by( [self], [ascending], na_position, stable=True ) @@ -1294,28 +1295,27 @@ def serialize(self) -> tuple[dict, list]: header: dict[Any, Any] = {} frames = [] - header["type-serialized"] = pickle.dumps(type(self)) try: - dtype, dtype_frames = self.dtype.serialize() + dtype, dtype_frames = self.dtype.device_serialize() header["dtype"] = dtype frames.extend(dtype_frames) header["dtype-is-cudf-serialized"] = True except AttributeError: - header["dtype"] = pickle.dumps(self.dtype) + header["dtype"] = self.dtype.str header["dtype-is-cudf-serialized"] = False if self.data is not None: - data_header, data_frames = self.data.serialize() + data_header, data_frames = self.data.device_serialize() header["data"] = data_header frames.extend(data_frames) if self.mask is not None: - mask_header, mask_frames = self.mask.serialize() + mask_header, mask_frames = self.mask.device_serialize() header["mask"] = mask_header frames.extend(mask_frames) if self.children: child_headers, child_frames = zip( - *(c.serialize() for c in self.children) + *(c.device_serialize() for c in self.children) ) header["subheaders"] = list(child_headers) frames.extend(chain(*child_frames)) @@ -1327,8 +1327,7 @@ def serialize(self) -> tuple[dict, list]: def deserialize(cls, header: dict, frames: list) -> ColumnBase: def unpack(header, frames) -> tuple[Any, list]: count = header["frame_count"] - klass = pickle.loads(header["type-serialized"]) - obj = klass.deserialize(header, frames[:count]) + obj = cls.device_deserialize(header, frames[:count]) return obj, frames[count:] assert header["frame_count"] == len(frames), ( @@ -1338,7 +1337,7 @@ def unpack(header, frames) -> tuple[Any, list]: if header["dtype-is-cudf-serialized"]: dtype, frames = unpack(header["dtype"], frames) else: - dtype = pickle.loads(header["dtype"]) + dtype = np.dtype(header["dtype"]) if "data" in header: data, frames = unpack(header["data"], frames) else: @@ -1396,33 +1395,35 @@ def _reduce( ) if isinstance(preprocessed, ColumnBase): dtype = kwargs.pop("dtype", None) - return libcudf.reduce.reduce( - op, preprocessed, dtype=dtype, **kwargs - ) + return preprocessed.reduce(op, dtype, **kwargs) return preprocessed + def _can_return_nan(self, skipna: bool | None = None) -> bool: + return not skipna and self.has_nulls(include_nan=False) + def _process_for_reduction( self, skipna: bool | None = None, min_count: int = 0 ) -> ColumnBase | ScalarLike: - if skipna is None: - skipna = True + skipna = True if skipna is None else skipna - if self.has_nulls(): + if self._can_return_nan(skipna=skipna): + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + + col = self.nans_to_nulls() if skipna else self + if col.has_nulls(): if skipna: - result_col = self.dropna() + col = col.dropna() else: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - result_col = self - # TODO: If and when pandas decides to validate that `min_count` >= 0 we # should insert comparable behavior. # https://github.com/pandas-dev/pandas/issues/50022 if min_count > 0: - valid_count = len(result_col) - result_col.null_count + valid_count = len(col) - col.null_count if valid_count < min_count: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - return result_col + return col def _reduction_result_dtype(self, reduction_op: str) -> Dtype: """ @@ -1515,7 +1516,7 @@ def _return_sentinel_column(): del right_rows # reorder `codes` so that its values correspond to the # values of `self`: - (codes,) = libcudf.sort.sort_by_key( + (codes,) = sorting.sort_by_key( codes, [left_gather_map], [True], ["last"], stable=True ) return codes.fillna(na_sentinel.value) @@ -1532,6 +1533,91 @@ def one_hot_encode( for col in plc_table.columns() ) + @acquire_spill_lock() + def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.reduce.scan( + self.to_pylibcudf(mode="read"), + aggregation.make_aggregation(scan_op, kwargs).c_obj, + plc.reduce.ScanType.INCLUSIVE + if inclusive + else plc.reduce.ScanType.EXCLUSIVE, + ) + ) + + def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: + if dtype is not None: + warnings.warn( + "dtype is deprecated and will be remove in a future release. " + "Cast the result (e.g. .astype) after the operation instead.", + FutureWarning, + ) + col_dtype = dtype + else: + col_dtype = self._reduction_result_dtype(reduction_op) + + # check empty case + if len(self) <= self.null_count: + if reduction_op == "sum" or reduction_op == "sum_of_squares": + return self.dtype.type(0) + if reduction_op == "product": + return self.dtype.type(1) + if reduction_op == "any": + return False + + return cudf.utils.dtypes._get_nan_for_dtype(col_dtype) + + with acquire_spill_lock(): + plc_scalar = plc.reduce.reduce( + self.to_pylibcudf(mode="read"), + aggregation.make_aggregation(reduction_op, kwargs).c_obj, + dtype_to_pylibcudf_type(col_dtype), + ) + result_col = type(self).from_pylibcudf( + plc.Column.from_scalar(plc_scalar, 1) + ) + if plc_scalar.type().id() in { + plc.TypeId.DECIMAL128, + plc.TypeId.DECIMAL64, + plc.TypeId.DECIMAL32, + }: + scale = -plc_scalar.type().scale() + # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql + p = col_dtype.precision + nrows = len(self) + if reduction_op in {"min", "max"}: + new_p = p + elif reduction_op == "sum": + new_p = p + nrows - 1 + elif reduction_op == "product": + new_p = p * nrows + nrows - 1 + elif reduction_op == "sum_of_squares": + new_p = 2 * p + nrows + else: + raise NotImplementedError( + f"{reduction_op} not implemented for decimal types." + ) + precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) + new_dtype = type(col_dtype)(precision, scale) + result_col = result_col.astype(new_dtype) + elif isinstance(col_dtype, cudf.IntervalDtype): + result_col = type(self).from_struct_column( # type: ignore[attr-defined] + result_col, closed=col_dtype.closed + ) + return result_col.element_indexing(0) + + @acquire_spill_lock() + def minmax(self) -> tuple[ScalarLike, ScalarLike]: + min_val, max_val = plc.reduce.minmax(self.to_pylibcudf(mode="read")) + return ( + type(self) + .from_pylibcudf(plc.Column.from_scalar(min_val, 1)) + .element_indexing(0), + type(self) + .from_pylibcudf(plc.Column.from_scalar(max_val, 1)) + .element_indexing(0), + ) + def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" @@ -1544,7 +1630,6 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: def column_empty( row_count: int, dtype: Dtype = "object", - masked: bool = False, for_numba: bool = False, ) -> ColumnBase: """ @@ -1561,9 +1646,6 @@ def column_empty( dtype : Dtype Type of the column. - masked : bool - Unused. - for_numba : bool, default False If True, don't allocate a mask as it's not supported by numba. """ @@ -2307,7 +2389,9 @@ def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: frames = [] if len(columns) > 0: - header_columns = [c.serialize() for c in columns] + header_columns: list[tuple[dict, list]] = [ + c.device_serialize() for c in columns + ] headers, column_frames = zip(*header_columns) for f in column_frames: frames.extend(f) @@ -2324,7 +2408,7 @@ def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]: for meta in headers: col_frame_count = meta["frame_count"] - col_typ = pickle.loads(meta["type-serialized"]) + col_typ = Serializable._name_type_map[meta["type-serialized-name"]] colobj = col_typ.deserialize(meta, frames[:col_frame_count]) columns.append(colobj) # Advance frames @@ -2337,7 +2421,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: dtype = cudf.dtype(None) - return column_empty(0, dtype=dtype, masked=True) + return column_empty(0, dtype=dtype) # If all columns are `NumericalColumn` with different dtypes, # we cast them to a common dtype. @@ -2384,7 +2468,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: - return column_empty(0, head.dtype, masked=True) + return column_empty(0, head.dtype) # Filter out inputs that have 0 length, then concatenate. objs_with_len = [o for o in objs if len(o)] diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b526a6efa51..81b82040b8d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -598,14 +598,12 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: if len(self) == 0: return cast( cudf.core.column.StringColumn, - column.column_empty(0, dtype="object", masked=False), + column.column_empty(0, dtype="object"), ) if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: - names = cudf.core.column.column_empty( - 0, dtype="object", masked=False - ) + names = column.column_empty(0, dtype="object") return string._datetime_to_str_typecast_functions[self.dtype]( self, format, names ) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 34975fc94f4..dd8f58a118e 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -14,7 +14,6 @@ if TYPE_CHECKING: from typing_extensions import Self - from cudf._typing import ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase @@ -211,16 +210,3 @@ def element_indexing(self, index: int): if cudf.get_option("mode.pandas_compatible"): return pd.Interval(**result, closed=self.dtype.closed) return result - - def _reduce( - self, - op: str, - skipna: bool | None = None, - min_count: int = 0, - *args, - **kwargs, - ) -> ScalarLike: - result = super()._reduce(op, skipna, min_count, *args, **kwargs) - if cudf.get_option("mode.pandas_compatible"): - return pd.Interval(**result, closed=self.dtype.closed) - return result diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 790cd6ea9bb..f099cef3331 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -420,22 +420,12 @@ def all(self, skipna: bool = True) -> bool: # If all entries are null the result is True, including when the column # is empty. result_col = self.nans_to_nulls() if skipna else self - - if result_col.null_count == result_col.size: - return True - - return libcudf.reduce.reduce("all", result_col) + return super(type(self), result_col).all(skipna=skipna) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. result_col = self.nans_to_nulls() if skipna else self - - if not skipna and result_col.has_nulls(): - return True - elif skipna and result_col.null_count == result_col.size: - return False - - return libcudf.reduce.reduce("any", result_col) + return super(type(self), result_col).any(skipna=skipna) @functools.cached_property def nan_count(self) -> int: @@ -483,19 +473,6 @@ def _process_values_for_isin( def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls(include_nan=True) - def _process_for_reduction( - self, skipna: bool | None = None, min_count: int = 0 - ) -> NumericalColumn | ScalarLike: - skipna = True if skipna is None else skipna - - if self._can_return_nan(skipna=skipna): - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - col = self.nans_to_nulls() if skipna else self - return super(NumericalColumn, col)._process_for_reduction( - skipna=skipna, min_count=min_count - ) - def find_and_replace( self, to_replace: ColumnLike, @@ -741,6 +718,40 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: return super()._reduction_result_dtype(reduction_op) + @acquire_spill_lock() + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: + """Return the indices of the bins to which each value in column belongs. + + Parameters + ---------- + bins : np.ndarray + 1-D column-like object of bins with same type as `column`, should be + monotonically increasing. + right : bool + Indicates whether interval contains the right or left bin edge. + + Returns + ------- + A column containing the indices + """ + if self.dtype != bins.dtype: + raise ValueError( + "digitize() expects bins and input column have the same dtype." + ) + + bin_col = as_column(bins, dtype=bins.dtype) + if bin_col.nullable: + raise ValueError("`bins` cannot contain null entries.") + + return type(self).from_pylibcudf( # type: ignore[return-value] + getattr(plc.search, "lower_bound" if right else "upper_bound")( + plc.Table([bin_col.to_pylibcudf(mode="read")]), + plc.Table([self.to_pylibcudf(mode="read")]), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.BEFORE], + ) + ) + def _normalize_find_and_replace_input( input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list @@ -795,34 +806,3 @@ def _normalize_find_and_replace_input( if not normalized_column.can_cast_safely(input_column_dtype): return normalized_column return normalized_column.astype(input_column_dtype) - - -def digitize( - column: ColumnBase, bins: np.ndarray, right: bool = False -) -> ColumnBase: - """Return the indices of the bins to which each value in column belongs. - - Parameters - ---------- - column : Column - Input column. - bins : Column-like - 1-D column-like object of bins with same type as `column`, should be - monotonically increasing. - right : bool - Indicates whether interval contains the right or left bin edge. - - Returns - ------- - A column containing the indices - """ - if not column.dtype == bins.dtype: - raise ValueError( - "Digitize() expects bins and input column have the same dtype." - ) - - bin_col = as_column(bins, dtype=bins.dtype) - if bin_col.nullable: - raise ValueError("`bins` cannot contain null entries.") - - return as_column(libcudf.sort.digitize([column], [bin_col], right)) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 3f9abdabc2f..aaf2239a71e 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -10,7 +10,7 @@ import pylibcudf as plc import cudf -from cudf import _lib as libcudf +from cudf.core._internals import sorting from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.missing import NA @@ -139,12 +139,12 @@ def quantile( result = cast( NumericalBaseColumn, cudf.core.column.column_empty( - row_count=len(q), dtype=self.dtype, masked=True + row_count=len(q), dtype=self.dtype ), ) else: # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( + indices = sorting.order_by( [self], [True], "first", stable=True ).slice(self.null_count, len(self)) with acquire_spill_lock(): @@ -263,6 +263,6 @@ def round( ) def _scan(self, op: str) -> ColumnBase: - return libcudf.reduce.scan( - op.replace("cum", ""), self, True - )._with_type_metadata(self.dtype) + return self.scan(op.replace("cum", ""), True)._with_type_metadata( + self.dtype + ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 06196717ce3..d76caa5c3b8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -20,7 +20,7 @@ import cudf.core.column.column as column import cudf.core.column.datetime as datetime from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast, strings as libstrings +from cudf._lib import string_casting as str_cast from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype @@ -45,6 +45,7 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.lists import ListColumn from cudf.core.column.numerical import NumericalColumn @@ -624,7 +625,7 @@ def join( def _split_by_character(self): col = self._column.fillna("") # sanitize nulls - result_col = libstrings.character_tokenize(col) + result_col = col.character_tokenize() offset_col = col.children[0] @@ -4693,9 +4694,7 @@ def normalize_spaces(self) -> SeriesOrIndex: 1 test string dtype: object """ - return self._return_or_inplace( - libstrings.normalize_spaces(self._column) - ) + return self._return_or_inplace(self._column.normalize_spaces()) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" @@ -4743,7 +4742,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: dtype: object """ return self._return_or_inplace( - libstrings.normalize_characters(self._column, do_lower) + self._column.normalize_characters(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -4775,16 +4774,16 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: 2 goodbye dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + if isinstance(delim, Column): result = self._return_or_inplace( - libstrings._tokenize_column(self._column, delimiter), + self._column.tokenize_column(delim), retain_index=False, ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): result = self._return_or_inplace( - libstrings._tokenize_scalar(self._column, delimiter), + self._column.tokenize_scalar(delim), retain_index=False, ) else: @@ -4799,7 +4798,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: return result def detokenize( - self, indices: "cudf.Series", separator: str = " " + self, indices: cudf.Series, separator: str = " " ) -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order @@ -4829,9 +4828,9 @@ def detokenize( 2 three dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.detokenize(self._column, indices._column, separator), + self._column.detokenize(indices._column, sep), # type: ignore[arg-type] retain_index=False, ) @@ -4882,17 +4881,15 @@ def character_tokenize(self) -> SeriesOrIndex: 2 . dtype: object """ - result_col = libstrings.character_tokenize(self._column) + result_col = self._column.character_tokenize() if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - return cudf.Series._from_column( + return type(self._parent)._from_column( result_col, name=self._parent.name, index=index ) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index._from_column(result_col, name=self._parent.name) else: - return result_col + return self._return_or_inplace(result_col) def token_count(self, delimiter: str = " ") -> SeriesOrIndex: """ @@ -4919,15 +4916,15 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: 2 0 dtype: int32 """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) + if isinstance(delim, Column): return self._return_or_inplace( - libstrings._count_tokens_column(self._column, delimiter) + self._column.count_tokens_column(delim) ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): return self._return_or_inplace( - libstrings._count_tokens_scalar(self._column, delimiter) + self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) else: raise TypeError( @@ -4966,9 +4963,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: 2 xyz_hhh dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.generate_ngrams(self._column, n, separator), + self._column.generate_ngrams(n, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5015,7 +5012,7 @@ def character_ngrams( dtype: list """ result = self._return_or_inplace( - libstrings.generate_character_ngrams(self._column, n), + self._column.generate_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5060,7 +5057,7 @@ def hash_character_ngrams( """ result = self._return_or_inplace( - libstrings.hash_character_ngrams(self._column, n), + self._column.hash_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5098,10 +5095,10 @@ def ngrams_tokenize( 2 best_book dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter") - separator = _massage_string_arg(separator, "separator") + delim = _massage_string_arg(delimiter, "delimiter") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.ngrams_tokenize(self._column, n, delimiter, separator), + self._column.ngrams_tokenize(n, delim, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5180,10 +5177,9 @@ def replace_tokens( ) return self._return_or_inplace( - libstrings.replace_tokens( - self._column, - targets_column, - replacements_column, + self._column.replace_tokens( + targets_column, # type: ignore[arg-type] + replacements_column, # type: ignore[arg-type] cudf.Scalar(delimiter, dtype="str"), ), ) @@ -5251,8 +5247,7 @@ def filter_tokens( ) return self._return_or_inplace( - libstrings.filter_tokens( - self._column, + self._column.filter_tokens( min_token_length, cudf.Scalar(replacement, dtype="str"), cudf.Scalar(delimiter, dtype="str"), @@ -5278,9 +5273,7 @@ def porter_stemmer_measure(self) -> SeriesOrIndex: 1 2 dtype: int32 """ - return self._return_or_inplace( - libstrings.porter_stemmer_measure(self._column) - ) + return self._return_or_inplace(self._column.porter_stemmer_measure()) def is_consonant(self, position) -> SeriesOrIndex: """ @@ -5313,17 +5306,10 @@ def is_consonant(self, position) -> SeriesOrIndex: 1 False dtype: bool """ - ltype = libstrings.LetterType.CONSONANT - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(False, position) # type: ignore[arg-type] ) def is_vowel(self, position) -> SeriesOrIndex: @@ -5357,17 +5343,10 @@ def is_vowel(self, position) -> SeriesOrIndex: 1 True dtype: bool """ - ltype = libstrings.LetterType.VOWEL - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(True, position) # type: ignore[arg-type] ) def edit_distance(self, targets) -> SeriesOrIndex: @@ -5416,7 +5395,7 @@ def edit_distance(self, targets) -> SeriesOrIndex: ) return self._return_or_inplace( - libstrings.edit_distance(self._column, targets_column) + self._column.edit_distance(targets_column) # type: ignore[arg-type] ) def edit_distance_matrix(self) -> SeriesOrIndex: @@ -5456,9 +5435,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." ) - return self._return_or_inplace( - libstrings.edit_distance_matrix(self._column) - ) + return self._return_or_inplace(self._column.edit_distance_matrix()) def minhash( self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int @@ -5508,7 +5485,7 @@ def minhash( f"Expecting a Series with dtype uint32, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash(self._column, seed, a_column, b_column, width) + self._column.minhash(seed, a_column, b_column, width) # type: ignore[arg-type] ) def minhash64( @@ -5559,7 +5536,7 @@ def minhash64( f"Expecting a Series with dtype uint64, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash64(self._column, seed, a_column, b_column, width) + self._column.minhash64(seed, a_column, b_column, width) # type: ignore[arg-type] ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: @@ -5585,13 +5562,14 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: 1 0.307692 dtype: float32 """ - return self._return_or_inplace( - libstrings.jaccard_index(self._column, input._column, width), + self._column.jaccard_index(input._column, width) ) -def _massage_string_arg(value, name, allow_col=False): +def _massage_string_arg( + value, name, allow_col: bool = False +) -> StringColumn | cudf.Scalar: if isinstance(value, cudf.Scalar): return value @@ -5602,9 +5580,9 @@ def _massage_string_arg(value, name, allow_col=False): if allow_col: if isinstance(value, list): - return column.as_column(value, dtype="str") + return column.as_column(value, dtype="str") # type: ignore[return-value] - if isinstance(value, Column) and is_string_dtype(value.dtype): + if isinstance(value, StringColumn): return value allowed_types.append("Column") @@ -5877,7 +5855,7 @@ def strptime( f"dtype must be datetime or timedelta type, not {dtype}" ) elif self.null_count == len(self): - return column.column_empty(len(self), dtype=dtype, masked=True) # type: ignore[return-value] + return column.column_empty(len(self), dtype=dtype) # type: ignore[return-value] elif (self == "None").any(): raise ValueError( "Cannot convert `None` value to datetime or timedelta." @@ -6148,6 +6126,278 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) + @acquire_spill_lock() + def minhash( + self, + seed: np.uint32, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def minhash64( + self, + seed: np.uint64, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash64( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def jaccard_index(self, other: Self, width: int) -> NumericalColumn: + result = plc.nvtext.jaccard.jaccard_index( + self.to_pylibcudf(mode="read"), + other.to_pylibcudf(mode="read"), + width, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + result = plc.nvtext.generate_ngrams.generate_ngrams( + self.to_pylibcudf(mode="read"), + ngrams, + separator.device_value.c_value, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_character_ngrams(self, ngrams: int) -> ListColumn: + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def hash_character_ngrams(self, ngrams: int) -> ListColumn: + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance(self, targets: Self) -> NumericalColumn: + result = plc.nvtext.edit_distance.edit_distance( + self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance_matrix(self) -> ListColumn: + result = plc.nvtext.edit_distance.edit_distance_matrix( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def byte_pair_encoding( + self, + merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.byte_pair_encode.byte_pair_encoding( + self.to_pylibcudf(mode="read"), + merge_pairs, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def ngrams_tokenize( + self, + ngrams: int, + delimiter: cudf.Scalar, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.ngrams_tokenize.ngrams_tokenize( + self.to_pylibcudf(mode="read"), + ngrams, + delimiter.device_value.c_value, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def normalize_spaces(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_spaces( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def normalize_characters(self, do_lower: bool = True) -> Self: + return Column.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + do_lower, + ) + ) + + @acquire_spill_lock() + def replace_tokens( + self, targets: Self, replacements: Self, delimiter: cudf.Scalar + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.replace_tokens( + self.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def filter_tokens( + self, + min_token_length: int, + replacement: cudf.Scalar, + delimiter: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.filter_tokens( + self.to_pylibcudf(mode="read"), + min_token_length, + replacement.device_value.c_value, + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def porter_stemmer_measure(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.porter_stemmer_measure( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def is_letter(self, is_vowel: bool, index: int | NumericalColumn) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.is_letter( + self.to_pylibcudf(mode="read"), + is_vowel, + index + if isinstance(index, int) + else index.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def subword_tokenize( + self, + hashed_vocabulary: plc.nvtext.subword_tokenize.HashedVocabulary, + max_sequence_length: int = 64, + stride: int = 48, + do_lower: bool = True, + do_truncate: bool = False, + ) -> tuple[ColumnBase, ColumnBase, ColumnBase]: + """ + Subword tokenizes text series by using the pre-loaded hashed vocabulary + """ + result = plc.nvtext.subword_tokenize.subword_tokenize( + self.to_pylibcudf(mode="read"), + hashed_vocabulary, + max_sequence_length, + stride, + do_lower, + do_truncate, + ) + # return the 3 tensor components + tokens = type(self).from_pylibcudf(result[0]) + masks = type(self).from_pylibcudf(result[1]) + metadata = type(self).from_pylibcudf(result[2]) + return tokens, masks, metadata + + @acquire_spill_lock() + def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def tokenize_column(self, delimiters: Self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def count_tokens_column(self, delimiters: Self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def character_tokenize(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.character_tokenize( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def tokenize_with_vocabulary( + self, + vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, + delimiter: cudf.Scalar, + default_id: int, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_with_vocabulary( + self.to_pylibcudf(mode="read"), + vocabulary, + delimiter.device_value.c_value, + default_id, + ) + ) + + @acquire_spill_lock() + def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.detokenize( + self.to_pylibcudf(mode="read"), + indices.to_pylibcudf(mode="read"), + separator.device_value.c_value, + ) + ) + def _modify_characters( self, method: Callable[[plc.Column], plc.Column] ) -> Self: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index db6ad72ab56..ba765b50729 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -107,12 +107,9 @@ def memory_usage(self) -> int: return n - def element_indexing(self, index: int): + def element_indexing(self, index: int) -> dict: result = super().element_indexing(index) - return { - field: value - for field, value in zip(self.dtype.fields, result.values()) - } + return dict(zip(self.dtype.fields, result.values())) def __setitem__(self, key, value): if isinstance(value, dict): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index f3a7916aa35..8b1515acae2 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -294,7 +294,7 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: if len(self) == 0: return cast( cudf.core.column.StringColumn, - column.column_empty(0, dtype="object", masked=False), + column.column_empty(0, dtype="object"), ) else: return string._timedelta_to_str_typecast_functions[self.dtype]( diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 16d8964f083..4b6ad59c8e1 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -5,7 +5,6 @@ from typing_extensions import Self import cudf -import cudf._lib as libcudf from cudf._lib.types import size_type_dtype if TYPE_CHECKING: @@ -70,8 +69,8 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): if self.column.dtype.kind not in {"i", "u"}: raise TypeError("Gather map must have integer dtype") if not nullify: - lo, hi = libcudf.reduce.minmax(self.column) - if lo.value < -nrows or hi.value >= nrows: + lo, hi = self.column.minmax() + if lo < -nrows or hi >= nrows: raise IndexError( f"Gather map is out of bounds for [0, {nrows})" ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 325601e5311..fce361e18ea 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7,7 +7,6 @@ import itertools import numbers import os -import pickle import re import sys import textwrap @@ -50,7 +49,6 @@ ) from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column import ( CategoricalColumn, @@ -588,7 +586,7 @@ class _DataFrameiAtIndexer(_DataFrameIlocIndexer): pass -class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): +class DataFrame(IndexedFrame, GetAttrGetItemMixin): """ A GPU Dataframe object. @@ -776,9 +774,7 @@ def __init__( label_dtype = getattr(columns, "dtype", None) self._data = ColumnAccessor( { - k: column.column_empty( - len(self), dtype="object", masked=True - ) + k: column_empty(len(self), dtype="object") for k in columns }, level_names=tuple(columns.names) @@ -981,8 +977,8 @@ def _init_from_series_list(self, data, columns, index): if columns is not None: for col_name in columns: if col_name not in self._data: - self._data[col_name] = column.column_empty( - row_count=len(self), dtype=None, masked=True + self._data[col_name] = column_empty( + row_count=len(self), dtype=None ) self._data._level_names = ( tuple(columns.names) @@ -1033,11 +1029,7 @@ def _init_from_list_like(self, data, index=None, columns=None): data = list(itertools.zip_longest(*data)) if columns is not None and len(data) == 0: - data = [ - cudf.core.column.column_empty(row_count=0, dtype=None) - for _ in columns - ] - + data = [column_empty(row_count=0, dtype=None) for _ in columns] for col_name, col in enumerate(data): self._data[col_name] = column.as_column(col) self._data.rangeindex = True @@ -1076,9 +1068,8 @@ def _init_from_dict_like( # the provided index, so we need to return a masked # array of nulls if an index is given. empty_column = functools.partial( - cudf.core.column.column_empty, - row_count=(0 if index is None else len(index)), - masked=index is not None, + column_empty, + row_count=0 if index is None else len(index), ) data = { @@ -1190,7 +1181,7 @@ def _constructor_expanddim(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. @@ -1205,8 +1196,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj @@ -1424,7 +1414,7 @@ def __setitem__(self, arg, value): new_columns = ( value if key == arg - else column.column_empty( + else column_empty( row_count=length, dtype=col.dtype ) for key, col in self._column_labels_and_values @@ -2508,16 +2498,7 @@ def scatter_by_map( ) if map_index.size > 0: - plc_lo, plc_hi = plc.reduce.minmax( - map_index.to_pylibcudf(mode="read") - ) - # TODO: Use pylibcudf Scalar once APIs are more developed - lo = libcudf.column.Column.from_pylibcudf( - plc.Column.from_scalar(plc_lo, 1) - ).element_indexing(0) - hi = libcudf.column.Column.from_pylibcudf( - plc.Column.from_scalar(plc_hi, 1) - ).element_indexing(0) + lo, hi = map_index.minmax() if lo < 0 or hi >= map_size: raise ValueError("Partition map has invalid values") @@ -3385,7 +3366,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if num_cols != 0: ca = self._data._from_columns_like_self( ( - column.column_empty(row_count=length, dtype=dtype) + column_empty(row_count=length, dtype=dtype) for _, dtype in self._dtypes ), verify=False, @@ -3491,7 +3472,7 @@ def diff(self, periods=1, axis=0): if abs(periods) > len(self): df = cudf.DataFrame._from_data( { - name: column_empty(len(self), dtype=dtype, masked=True) + name: column_empty(len(self), dtype=dtype) for name, dtype in zip(self._column_names, self.dtypes) } ) @@ -3871,9 +3852,7 @@ def agg(self, aggs, axis=None): result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): col = self[key] - col_empty = column_empty( - len(idxs), dtype=col.dtype, masked=True - ) + col_empty = column_empty(len(idxs), dtype=col.dtype) ans = cudf.Series._from_column( col_empty, index=cudf.Index(idxs) ) @@ -6189,9 +6168,7 @@ def quantile( quant_index=False, )._column if len(res) == 0: - res = column.column_empty( - row_count=len(qs), dtype=ser.dtype - ) + res = column_empty(row_count=len(qs), dtype=ser.dtype) result[k] = res result = DataFrame._from_data(result) @@ -7345,9 +7322,7 @@ def unnamed_group_generator(): ) all_nulls = functools.cache( - functools.partial( - column_empty, self.shape[0], common_type, masked=True - ) + functools.partial(column_empty, self.shape[0], common_type) ) # homogenize the dtypes of the columns @@ -8594,7 +8569,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): # If column not in this df, fill with an all-null column if idx >= len(cols) or cols[idx] is None: n = len(next(x for x in cols if x is not None)) - cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True) + cols[idx] = column_empty(row_count=n, dtype=dtype) else: # If column is categorical, rebase the codes with the # combined categories, and cast the new codes to the diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 801020664da..971f0be77f8 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -3,7 +3,6 @@ import decimal import operator -import pickle import textwrap import warnings from functools import cached_property @@ -91,13 +90,13 @@ def dtype(arbitrary): raise TypeError(f"Cannot interpret {arbitrary} as a valid cuDF dtype") -def _decode_type( +def _check_type( cls: type, header: dict, frames: list, is_valid_class: Callable[[type, type], bool] = operator.is_, -) -> tuple[dict, list, type]: - """Decode metadata-encoded type and check validity +) -> None: + """Perform metadata-encoded type and check validity Parameters ---------- @@ -112,12 +111,6 @@ class performing deserialization serialization by `cls` (default is to check type equality), called as `is_valid_class(decoded_class, cls)`. - Returns - ------- - tuple - Tuple of validated headers, frames, and the decoded class - constructor. - Raises ------ AssertionError @@ -128,11 +121,11 @@ class performing deserialization f"Deserialization expected {header['frame_count']} frames, " f"but received {len(frames)}." ) - klass = pickle.loads(header["type-serialized"]) + klass = Serializable._name_type_map[header["type-serialized-name"]] assert is_valid_class( - klass, cls + klass, + cls, ), f"Header-encoded {klass=} does not match decoding {cls=}." - return header, frames, klass class _BaseDtype(ExtensionDtype, Serializable): @@ -196,9 +189,7 @@ def categories(self) -> cudf.Index: Index(['b', 'a'], dtype='object') """ if self._categories is None: - col = cudf.core.column.column_empty( - 0, dtype="object", masked=False - ) + col = cudf.core.column.column_empty(0, dtype="object") else: col = self._categories return cudf.Index._from_column(col) @@ -305,13 +296,14 @@ def construct_from_string(self): def serialize(self): header = {} - header["type-serialized"] = pickle.dumps(type(self)) header["ordered"] = self.ordered frames = [] if self.categories is not None: - categories_header, categories_frames = self.categories.serialize() + categories_header, categories_frames = ( + self.categories.device_serialize() + ) header["categories"] = categories_header frames.extend(categories_frames) header["frame_count"] = len(frames) @@ -319,15 +311,14 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) ordered = header["ordered"] categories_header = header["categories"] categories_frames = frames - categories_type = pickle.loads(categories_header["type-serialized"]) - categories = categories_type.deserialize( + categories = Serializable.device_deserialize( categories_header, categories_frames ) - return klass(categories=categories, ordered=ordered) + return cls(categories=categories, ordered=ordered) def __repr__(self): return self.to_pandas().__repr__() @@ -495,12 +486,13 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Dtype] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames = [] if isinstance(self.element_type, _BaseDtype): - header["element-type"], frames = self.element_type.serialize() + header["element-type"], frames = ( + self.element_type.device_serialize() + ) else: header["element-type"] = getattr( self.element_type, "name", self.element_type @@ -510,14 +502,14 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) if isinstance(header["element-type"], dict): - element_type = pickle.loads( - header["element-type"]["type-serialized"] - ).deserialize(header["element-type"], frames) + element_type = Serializable.device_deserialize( + header["element-type"], frames + ) else: element_type = header["element-type"] - return klass(element_type=element_type) + return cls(element_type=element_type) @cached_property def itemsize(self): @@ -641,7 +633,6 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames: list[Buffer] = [] @@ -649,33 +640,31 @@ def serialize(self) -> tuple[dict, list]: for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): - dtype_header, dtype_frames = dtype.serialize() + dtype_header, dtype_frames = dtype.device_serialize() fields[k] = ( dtype_header, (len(frames), len(frames) + len(dtype_frames)), ) frames.extend(dtype_frames) else: - fields[k] = pickle.dumps(dtype) + fields[k] = dtype.str header["fields"] = fields header["frame_count"] = len(frames) return header, frames @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) fields = {} for k, dtype in header["fields"].items(): if isinstance(dtype, tuple): dtype_header, (start, stop) = dtype - fields[k] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize( + fields[k] = Serializable.device_deserialize( dtype_header, frames[start:stop], ) else: - fields[k] = pickle.loads(dtype) + fields[k] = np.dtype(dtype) return cls(fields) @cached_property @@ -838,7 +827,6 @@ def _from_decimal(cls, decimal): def serialize(self) -> tuple[dict, list]: return ( { - "type-serialized": pickle.dumps(type(self)), "precision": self.precision, "scale": self.scale, "frame_count": 0, @@ -848,11 +836,8 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type( - cls, header, frames, is_valid_class=issubclass - ) - klass = pickle.loads(header["type-serialized"]) - return klass(header["precision"], header["scale"]) + _check_type(cls, header, frames, is_valid_class=issubclass) + return cls(header["precision"], header["scale"]) def __eq__(self, other: Dtype) -> bool: if other is self: @@ -960,18 +945,17 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header = { - "type-serialized": pickle.dumps(type(self)), - "fields": pickle.dumps((self.subtype, self.closed)), + "fields": (self.subtype.str, self.closed), "frame_count": 0, } return header, [] @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) - klass = pickle.loads(header["type-serialized"]) - subtype, closed = pickle.loads(header["fields"]) - return klass(subtype, closed=closed) + _check_type(cls, header, frames) + subtype, closed = header["fields"] + subtype = np.dtype(subtype) + return cls(subtype, closed=closed) def _is_categorical_dtype(obj): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 84a3caf905f..4f40ba0bd92 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,7 +3,6 @@ from __future__ import annotations import operator -import pickle import warnings from collections import abc from typing import TYPE_CHECKING, Any, Literal @@ -23,7 +22,9 @@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sorting from cudf.core._internals.search import search_sorted +from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -47,7 +48,7 @@ # TODO: It looks like Frame is missing a declaration of `copy`, need to add -class Frame(BinaryOperand, Scannable): +class Frame(BinaryOperand, Scannable, Serializable): """A collection of Column objects with an optional index. Parameters @@ -97,37 +98,80 @@ def ndim(self) -> int: @_performance_tracking def serialize(self): # TODO: See if self._data can be serialized outright + frames = [] header = { - "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(self._column_names), - "column_rangeindex": pickle.dumps(self._data.rangeindex), - "column_multiindex": pickle.dumps(self._data.multiindex), - "column_label_dtype": pickle.dumps(self._data.label_dtype), - "column_level_names": pickle.dumps(self._data._level_names), + "column_label_dtype": None, + "dtype-is-cudf-serialized": False, } - header["columns"], frames = serialize_columns(self._columns) + if (label_dtype := self._data.label_dtype) is not None: + try: + header["column_label_dtype"], frames = ( + label_dtype.device_serialize() + ) + header["dtype-is-cudf-serialized"] = True + except AttributeError: + header["column_label_dtype"] = label_dtype.str + + header["columns"], column_frames = serialize_columns(self._columns) + column_names, column_names_numpy_type = ( + zip( + *[ + (cname.item(), type(cname).__name__) + if isinstance(cname, np.generic) + else (cname, "") + for cname in self._column_names + ] + ) + if self._column_names + else ((), ()) + ) + header |= { + "column_names": column_names, + "column_names_numpy_type": column_names_numpy_type, + "column_rangeindex": self._data.rangeindex, + "column_multiindex": self._data.multiindex, + "column_level_names": self._data._level_names, + } + frames.extend(column_frames) + return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): - cls_deserialize = pickle.loads(header["type-serialized"]) - column_names = pickle.loads(header["column_names"]) - columns = deserialize_columns(header["columns"], frames) kwargs = {} + dtype_header = header["column_label_dtype"] + if header["dtype-is-cudf-serialized"]: + count = dtype_header["frame_count"] + kwargs["label_dtype"] = cls.device_deserialize( + header, frames[:count] + ) + frames = frames[count:] + else: + kwargs["label_dtype"] = ( + np.dtype(dtype_header) if dtype_header is not None else None + ) + + columns = deserialize_columns(header["columns"], frames) for metadata in [ "rangeindex", "multiindex", - "label_dtype", "level_names", ]: key = f"column_{metadata}" if key in header: - kwargs[metadata] = pickle.loads(header[key]) + kwargs[metadata] = header[key] + + column_names = [ + getattr(np, cntype)(cname) if cntype != "" else cname + for cname, cntype in zip( + header["column_names"], header["column_names_numpy_type"] + ) + ] col_accessor = ColumnAccessor( data=dict(zip(column_names, columns)), **kwargs ) - return cls_deserialize._from_data(col_accessor) + return cls._from_data(col_accessor) @classmethod @_performance_tracking @@ -1433,7 +1477,7 @@ def _get_sorted_inds( else: ascending_lst = list(ascending) - return libcudf.sort.order_by( + return sorting.order_by( list(to_sort), ascending_lst, na_position, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 0f12f266a95..b772d35846d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -3,7 +3,6 @@ import copy import itertools -import pickle import textwrap import warnings from collections import abc @@ -19,11 +18,11 @@ import cudf from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby -from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sorting from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -494,9 +493,7 @@ def size(self): """ Return the size of each group. """ - col = cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ) + col = cudf.core.column.column_empty(len(self.obj), "int8") result = ( cudf.Series._from_column(col, name=getattr(self.obj, "name", None)) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) @@ -524,7 +521,8 @@ def cumcount(self, ascending: bool = True): return ( cudf.Series._from_column( cudf.core.column.column_empty( - len(self.obj), "int8", masked=False + len(self.obj), + "int8", ), index=self.obj.index, ) @@ -794,7 +792,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): # want, and right order is a matching gather map for # the result table. Get the correct order by sorting # the right gather map. - (right_order,) = libcudf.sort.sort_by_key( + (right_order,) = sorting.sort_by_key( [right_order], [left_order], [True], @@ -1250,15 +1248,20 @@ def sample( for off, size in zip(group_offsets, size_per_group): rs.shuffle(indices[off : off + size]) else: - rng = cp.random.default_rng(seed=random_state) - (indices,) = segmented_sort_by_key( - [as_column(indices)], - [as_column(rng.random(size=nrows))], - as_column(group_offsets), - [], - [], - stable=True, + keys = cp.random.default_rng(seed=random_state).random( + size=nrows ) + with acquire_spill_lock(): + plc_table = plc.sorting.stable_segmented_sort_by_key( + plc.Table( + [as_column(indices).to_pylibcudf(mode="read")] + ), + plc.Table([as_column(keys).to_pylibcudf(mode="read")]), + as_column(group_offsets).to_pylibcudf(mode="read"), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER], + ) + indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) @@ -1281,7 +1284,7 @@ def serialize(self): obj_header, obj_frames = self.obj.serialize() header["obj"] = obj_header - header["obj_type"] = pickle.dumps(type(self.obj)) + header["obj_type_name"] = type(self.obj).__name__ header["num_obj_frames"] = len(obj_frames) frames.extend(obj_frames) @@ -1296,7 +1299,7 @@ def serialize(self): def deserialize(cls, header, frames): kwargs = header["kwargs"] - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) @@ -3329,8 +3332,8 @@ def _handle_misc(self, by): def serialize(self): header = {} frames = [] - header["names"] = pickle.dumps(self.names) - header["_named_columns"] = pickle.dumps(self._named_columns) + header["names"] = self.names + header["_named_columns"] = self._named_columns column_header, column_frames = cudf.core.column.serialize_columns( self._key_columns ) @@ -3340,8 +3343,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cc3d8448151..8d3ef1036d1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3,7 +3,6 @@ from __future__ import annotations import operator -import pickle import warnings from collections.abc import Hashable, MutableMapping from functools import cache, cached_property @@ -337,7 +336,7 @@ def _values(self) -> ColumnBase: if len(self) > 0: return column.as_column(self._range, dtype=self.dtype) else: - return column.column_empty(0, masked=False, dtype=self.dtype) + return column.column_empty(0, dtype=self.dtype) def _clean_nulls_from_index(self) -> Self: return self @@ -497,9 +496,8 @@ def serialize(self): header["index_column"]["step"] = self.step frames = [] - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) + header["name"] = self.name + header["dtype"] = self.dtype.str header["frame_count"] = 0 return header, frames @@ -507,11 +505,14 @@ def serialize(self): @_performance_tracking def deserialize(cls, header, frames): h = header["index_column"] - name = pickle.loads(header["name"]) + name = header["name"] start = h["start"] stop = h["stop"] step = h.get("step", 1) - return RangeIndex(start=start, stop=stop, step=step, name=name) + dtype = np.dtype(header["dtype"]) + return RangeIndex( + start=start, stop=stop, step=step, dtype=dtype, name=name + ) @property # type: ignore @_performance_tracking diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 0e6a5e03ea6..1a667e24bef 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3851,7 +3851,6 @@ def _reindex( if name in df._data else cudf.core.column.column.column_empty( dtype=dtypes.get(name, np.float64), - masked=True, row_count=len(index), ) ) @@ -6368,9 +6367,49 @@ def rank( elif source._num_columns != num_cols: dropped_cols = True - result_columns = libcudf.sort.rank_columns( - [*source._columns], method_enum, na_option, ascending, pct + column_order = ( + plc.types.Order.ASCENDING + if ascending + else plc.types.Order.DESCENDING ) + # ascending + # #top = na_is_smallest + # #bottom = na_is_largest + # #keep = na_is_largest + # descending + # #top = na_is_largest + # #bottom = na_is_smallest + # #keep = na_is_smallest + if ascending: + if na_option == "top": + null_precedence = plc.types.NullOrder.BEFORE + else: + null_precedence = plc.types.NullOrder.AFTER + else: + if na_option == "top": + null_precedence = plc.types.NullOrder.AFTER + else: + null_precedence = plc.types.NullOrder.BEFORE + c_null_handling = ( + plc.types.NullPolicy.EXCLUDE + if na_option == "keep" + else plc.types.NullPolicy.INCLUDE + ) + + with acquire_spill_lock(): + result_columns = [ + libcudf.column.Column.from_pylibcudf( + plc.sorting.rank( + col.to_pylibcudf(mode="read"), + method_enum, + column_order, + c_null_handling, + null_precedence, + pct, + ) + ) + for col in source._columns + ] if dropped_cols: result = type(source)._from_data( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 5c224176730..e7ea91c1f21 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -9,6 +9,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.types import size_type_dtype +from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import ( @@ -256,7 +257,7 @@ def _gather_maps(self, left_cols, right_cols): for map_, n, null in zip(maps, lengths, nullify) ) ) - return libcudf.sort.sort_by_key( + return sorting.sort_by_key( list(maps), # If how is right, right map is primary sort key. key_order[:: -1 if self.how == "right" else 1], @@ -426,7 +427,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: else: to_sort = [*result._columns] index_names = None - result_columns = libcudf.sort.sort_by_key( + result_columns = sorting.sort_by_key( to_sort, by, [True] * len(by), diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 173d4e1c584..a99e06e4a8e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -5,7 +5,6 @@ import itertools import numbers import operator -import pickle import warnings from functools import cached_property from typing import TYPE_CHECKING, Any @@ -23,6 +22,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core._internals import sorting from cudf.core.algorithms import factorize from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor @@ -192,12 +192,12 @@ def __init__( source_data = {} for i, (code, level) in enumerate(zip(new_codes, new_levels)): if len(code): - lo, hi = libcudf.reduce.minmax(code) - if lo.value < -1 or hi.value > len(level) - 1: + lo, hi = code.minmax() + if lo < -1 or hi > len(level) - 1: raise ValueError( f"Codes must be -1 <= codes <= {len(level) - 1}" ) - if lo.value == -1: + if lo == -1: # Now we can gather and insert null automatically code[code == -1] = np.iinfo(size_type_dtype).min result_col = libcudf.copying.gather( @@ -921,15 +921,15 @@ def take(self, indices) -> Self: def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. - header["column_names"] = pickle.dumps(self.names) + header["column_names"] = self.names return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. - column_names = pickle.loads(header["column_names"]) - header["column_names"] = pickle.dumps(range(0, len(column_names))) + column_names = header["column_names"] + header["column_names"] = range(0, len(column_names)) obj = super().deserialize(header, frames) return obj._set_names(column_names) @@ -1678,7 +1678,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" ) - return libcudf.sort.is_sorted( + return sorting.is_sorted( [*self._columns], ascending=ascending, null_position=null_position ) diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index d95d252559f..391ee31f125 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -15,7 +15,6 @@ # limitations under the License. from __future__ import annotations -import pickle import warnings from typing import TYPE_CHECKING @@ -26,6 +25,7 @@ import cudf from cudf._lib.column import Column +from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.groupby.groupby import ( DataFrameGroupBy, @@ -97,21 +97,21 @@ def serialize(self): header, frames = super().serialize() grouping_head, grouping_frames = self.grouping.serialize() header["grouping"] = grouping_head - header["resampler_type"] = pickle.dumps(type(self)) + header["resampler_type"] = type(self).__name__ header["grouping_frames_count"] = len(grouping_frames) frames.extend(grouping_frames) return header, frames @classmethod def deserialize(cls, header, frames): - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) grouping = _ResampleGrouping.deserialize( header["grouping"], frames[header["num_obj_frames"] :] ) - resampler_cls = pickle.loads(header["resampler_type"]) + resampler_cls = Serializable._name_type_map[header["resampler_type"]] out = resampler_cls.__new__(resampler_cls) out.grouping = grouping super().__init__(out, obj, by=grouping) @@ -163,8 +163,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames[: -header["__bin_labels_count"]] ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index be74b0f867a..961e5e11bc0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4,7 +4,6 @@ import functools import inspect -import pickle import textwrap import warnings from collections import abc @@ -27,7 +26,6 @@ ) from cudf.core import indexing_utils from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -414,7 +412,7 @@ def _loc_to_iloc(self, arg): return indices -class Series(SingleColumnFrame, IndexedFrame, Serializable): +class Series(SingleColumnFrame, IndexedFrame): """ One-dimensional GPU array (including time series). @@ -899,7 +897,7 @@ def hasnans(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. @@ -915,8 +913,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj @@ -3413,7 +3410,7 @@ def describe( ) @_performance_tracking - def digitize(self, bins, right=False): + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: """Return the indices of the bins to which each value belongs. Notes @@ -3444,9 +3441,8 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series._from_column( - cudf.core.column.numerical.digitize(self._column, bins, right), - name=self.name, + return type(self)._from_column( + self._column.digitize(bins, right), name=self.name ) @_performance_tracking diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index dda1f199078..479838ef2a8 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -8,10 +8,6 @@ import pylibcudf as plc -from cudf._lib.nvtext.subword_tokenize import ( - subword_tokenize_inmem_hash as cpp_subword_tokenize, -) - def _cast_to_appropriate_type(ar, cast_type): if cast_type == "cp": @@ -210,8 +206,7 @@ def __call__( stride = max_length - stride # behavior varies from subword_tokenize but maps with huggingface - input_ids, attention_mask, metadata = cpp_subword_tokenize( - text._column, + input_ids, attention_mask, metadata = text._column.subword_tokenize( self.vocab_file, max_sequence_length=max_length, stride=stride, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index 1e31376cce8..fb8b9b3131c 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.tokenize import ( - tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, -) class TokenizeVocabulary: @@ -20,7 +17,7 @@ class TokenizeVocabulary: Strings column of vocabulary terms """ - def __init__(self, vocabulary: "cudf.Series"): + def __init__(self, vocabulary: cudf.Series) -> None: self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary( vocabulary._column.to_pylibcudf(mode="read") ) @@ -46,8 +43,8 @@ def tokenize( if delimiter is None: delimiter = "" delim = cudf.Scalar(delimiter, dtype="str") - result = cpp_tokenize_with_vocabulary( - text._column, self.vocabulary, delim, default_id + result = text._column.tokenize_with_vocabulary( + self.vocabulary, delim, default_id ) return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 094df955273..c4a063a50e8 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -6,7 +6,6 @@ import numpy as np -from cudf._lib.reduce import scan from cudf.api.types import is_numeric_dtype from cudf.core.window.rolling import _RollingBase @@ -194,13 +193,8 @@ def _apply_agg_column( # as such we need to convert the nans to nulls before # passing them in. to_libcudf_column = source_column.astype("float64").nans_to_nulls() - - return scan( - agg_name, - to_libcudf_column, - True, - com=self.com, - adjust=self.adjust, + return to_libcudf_column.scan( + agg_name, True, com=self.com, adjust=self.adjust ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3dc8915bfd1..da9a66f3874 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,57 +1,73 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations +import errno +import itertools +import os import warnings from collections import abc from io import BytesIO, StringIO +from typing import cast import numpy as np +import pandas as pd + +import pylibcudf as plc import cudf -from cudf import _lib as libcudf -from cudf.api.types import is_scalar +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf._lib.utils import data_from_pylibcudf_io +from cudf.api.types import is_hashable, is_scalar +from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type from cudf.utils.performance_tracking import _performance_tracking +_CSV_HEX_TYPE_MAP = { + "hex": np.dtype("int64"), + "hex64": np.dtype("int64"), + "hex32": np.dtype("int32"), +} + @_performance_tracking @ioutils.doc_read_csv() def read_csv( filepath_or_buffer, - sep=",", - delimiter=None, + sep: str = ",", + delimiter: str | None = None, header="infer", names=None, index_col=None, usecols=None, prefix=None, - mangle_dupe_cols=True, + mangle_dupe_cols: bool = True, dtype=None, true_values=None, false_values=None, - skipinitialspace=False, - skiprows=0, - skipfooter=0, - nrows=None, + skipinitialspace: bool = False, + skiprows: int = 0, + skipfooter: int = 0, + nrows: int | None = None, na_values=None, - keep_default_na=True, - na_filter=True, - skip_blank_lines=True, + keep_default_na: bool = True, + na_filter: bool = True, + skip_blank_lines: bool = True, parse_dates=None, - dayfirst=False, + dayfirst: bool = False, compression="infer", - thousands=None, - decimal=".", - lineterminator="\n", - quotechar='"', - quoting=0, - doublequote=True, - comment=None, - delim_whitespace=False, - byte_range=None, + thousands: str | None = None, + decimal: str = ".", + lineterminator: str = "\n", + quotechar: str = '"', + quoting: int = 0, + doublequote: bool = True, + comment: str | None = None, + delim_whitespace: bool = False, + byte_range: list[int] | tuple[int, int] | None = None, storage_options=None, - bytes_per_thread=None, -): + bytes_per_thread: int | None = None, +) -> cudf.DataFrame: """{docstring}""" if delim_whitespace is not False: @@ -77,60 +93,225 @@ def read_csv( if na_values is not None and is_scalar(na_values): na_values = [na_values] - df = libcudf.csv.read_csv( - filepath_or_buffer, - lineterminator=lineterminator, - quotechar=quotechar, - quoting=quoting, - doublequote=doublequote, - header=header, - mangle_dupe_cols=mangle_dupe_cols, - usecols=usecols, - sep=sep, - delimiter=delimiter, - delim_whitespace=delim_whitespace, - skipinitialspace=skipinitialspace, - names=names, - dtype=dtype, - skipfooter=skipfooter, - skiprows=skiprows, - dayfirst=dayfirst, - compression=compression, - thousands=thousands, - decimal=decimal, - true_values=true_values, - false_values=false_values, - nrows=nrows, - byte_range=byte_range, - skip_blank_lines=skip_blank_lines, - parse_dates=parse_dates, - comment=comment, - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - prefix=prefix, - index_col=index_col, + if not isinstance(filepath_or_buffer, (BytesIO, StringIO, bytes)): + if not os.path.isfile(filepath_or_buffer): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), filepath_or_buffer + ) + + if isinstance(filepath_or_buffer, StringIO): + filepath_or_buffer = filepath_or_buffer.read().encode() + elif isinstance(filepath_or_buffer, str) and not os.path.isfile( + filepath_or_buffer + ): + filepath_or_buffer = filepath_or_buffer.encode() + + _validate_args( + delimiter, + sep, + delim_whitespace, + decimal, + thousands, + nrows, + skipfooter, + byte_range, + skiprows, + ) + + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + + delimiter = str(delimiter) + + if byte_range is None: + byte_range = (0, 0) + + if compression is None: + c_compression = plc.io.types.CompressionType.NONE + else: + compression_map = { + "infer": plc.io.types.CompressionType.AUTO, + "gzip": plc.io.types.CompressionType.GZIP, + "bz2": plc.io.types.CompressionType.BZIP2, + "zip": plc.io.types.CompressionType.ZIP, + } + c_compression = compression_map[compression] + + # We need this later when setting index cols + orig_header = header + + if names is not None: + # explicitly mentioned name, so don't check header + if header is None or header == "infer": + header = -1 + else: + header = header + names = list(names) + else: + if header is None: + header = -1 + elif header == "infer": + header = 0 + + hex_cols: list[abc.Hashable] = [] + new_dtypes: list[plc.DataType] | dict[abc.Hashable, plc.DataType] = [] + if dtype is not None: + if isinstance(dtype, abc.Mapping): + new_dtypes = {} + for k, col_type in dtype.items(): + if is_hashable(col_type) and col_type in _CSV_HEX_TYPE_MAP: + col_type = _CSV_HEX_TYPE_MAP[col_type] + hex_cols.append(str(k)) + + new_dtypes[k] = _get_plc_data_type_from_dtype( + cudf.dtype(col_type) + ) + elif cudf.api.types.is_scalar(dtype) or isinstance( + dtype, (np.dtype, pd.api.extensions.ExtensionDtype, type) + ): + if is_hashable(dtype) and dtype in _CSV_HEX_TYPE_MAP: + dtype = _CSV_HEX_TYPE_MAP[dtype] + hex_cols.append(0) + + cast(list, new_dtypes).append(_get_plc_data_type_from_dtype(dtype)) + elif isinstance(dtype, abc.Collection): + for index, col_dtype in enumerate(dtype): + if is_hashable(col_dtype) and col_dtype in _CSV_HEX_TYPE_MAP: + col_dtype = _CSV_HEX_TYPE_MAP[col_dtype] + hex_cols.append(index) + + new_dtypes.append(_get_plc_data_type_from_dtype(col_dtype)) + else: + raise ValueError( + "dtype should be a scalar/str/list-like/dict-like" + ) + options = ( + plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([filepath_or_buffer]) + ) + .compression(c_compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range[0]) + .byte_range_size(byte_range[1]) + .nrows(nrows if nrows is not None else -1) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + .lineterminator(str(lineterminator)) + .quotechar(quotechar) + .decimal(decimal) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if names is not None: + options.set_names([str(name) for name in names]) + + if prefix is not None: + options.set_prefix(prefix) + + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + + if delimiter is not None: + options.set_delimiter(delimiter) + + if thousands is not None: + options.set_thousands(thousands) + + if comment is not None: + options.set_comment(comment) + + if parse_dates is not None: + options.set_parse_dates(list(parse_dates)) + + if hex_cols is not None: + options.set_parse_hex(list(hex_cols)) + + options.set_dtypes(new_dtypes) + + if true_values is not None: + options.set_true_values([str(val) for val in true_values]) + + if false_values is not None: + options.set_false_values([str(val) for val in false_values]) + + if na_values is not None: + options.set_na_values([str(val) for val in na_values]) + + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) ) + if isinstance(dtype, abc.Mapping): + for k, v in dtype.items(): + if isinstance(cudf.dtype(v), cudf.CategoricalDtype): + df._data[str(k)] = df._data[str(k)].astype(v) + elif dtype == "category" or isinstance(dtype, cudf.CategoricalDtype): + df = df.astype(dtype) + elif isinstance(dtype, abc.Collection) and not is_scalar(dtype): + for index, col_dtype in enumerate(dtype): + if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): + col_name = df._column_names[index] + df._data[col_name] = df._data[col_name].astype(col_dtype) + + if names is not None and len(names) and isinstance(names[0], int): + df.columns = [int(x) for x in df._data] + elif ( + names is None + and header == -1 + and cudf.get_option("mode.pandas_compatible") + ): + df.columns = [int(x) for x in df._column_names] + + # Set index if the index_col parameter is passed + if index_col is not None and index_col is not False: + if isinstance(index_col, int): + index_col_name = df._data.get_labels_by_index(index_col)[0] + df = df.set_index(index_col_name) + if ( + isinstance(index_col_name, str) + and names is None + and orig_header == "infer" + ): + if index_col_name.startswith("Unnamed:"): + # TODO: Try to upstream it to libcudf + # csv reader in future + df.index.name = None + elif names is None: + df.index.name = index_col + else: + df = df.set_index(index_col) + if dtype is None or isinstance(dtype, abc.Mapping): # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. specified_dtypes = {} if dtype is None else dtype - unspecified_dtypes = { - name: dtype - for name, dtype in df._dtypes - if name not in specified_dtypes - } default_dtypes = {} - - for name, dt in unspecified_dtypes.items(): - if dt == np.dtype("i1"): + for name, dt in df._dtypes: + if name in specified_dtypes: + continue + elif dt == np.dtype("i1"): # csv reader reads all null column as int8. # The dtype should remain int8. default_dtypes[name] = dt else: default_dtypes[name] = _maybe_convert_to_default_type(dt) - df = df.astype(default_dtypes) + + if default_dtypes: + df = df.astype(default_dtypes) return df @@ -138,17 +319,17 @@ def read_csv( @_performance_tracking @ioutils.doc_to_csv() def to_csv( - df, + df: cudf.DataFrame, path_or_buf=None, - sep=",", - na_rep="", + sep: str = ",", + na_rep: str = "", columns=None, - header=True, - index=True, + header: bool = True, + index: bool = True, encoding=None, compression=None, - lineterminator="\n", - chunksize=None, + lineterminator: str = "\n", + chunksize: int | None = None, storage_options=None, ): """{docstring}""" @@ -187,15 +368,10 @@ def to_csv( ) for _, dtype in df._dtypes: - if isinstance(dtype, cudf.ListDtype): - raise NotImplementedError( - "Writing to csv format is not yet supported with " - "list columns." - ) - elif isinstance(dtype, cudf.StructDtype): + if isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)): raise NotImplementedError( "Writing to csv format is not yet supported with " - "Struct columns." + f"{dtype} columns." ) # TODO: Need to typecast categorical columns to the underlying @@ -208,7 +384,7 @@ def to_csv( df = df.copy(deep=False) for col_name, col in df._column_labels_and_values: if isinstance(col.dtype, cudf.CategoricalDtype): - df._data[col_name] = col.astype(col.categories.dtype) + df._data[col_name] = col.astype(col.dtype.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): df.index = df.index.astype(df.index.categories.dtype) @@ -218,7 +394,7 @@ def to_csv( if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - libcudf.csv.write_csv( + _plc_write_csv( df, path_or_buf=file_obj, sep=sep, @@ -229,7 +405,7 @@ def to_csv( index=index, ) else: - libcudf.csv.write_csv( + _plc_write_csv( df, path_or_buf=path_or_buf, sep=sep, @@ -243,3 +419,127 @@ def to_csv( if return_as_string: path_or_buf.seek(0) return path_or_buf.read() + + +@acquire_spill_lock() +def _plc_write_csv( + table: cudf.DataFrame, + path_or_buf=None, + sep: str = ",", + na_rep: str = "", + header: bool = True, + lineterminator: str = "\n", + rows_per_chunk: int = 8, + index: bool = True, +) -> None: + iter_columns = ( + itertools.chain(table.index._columns, table._columns) + if index + else table._columns + ) + columns = [col.to_pylibcudf(mode="read") for col in iter_columns] + col_names = [] + if header: + table_names = ( + na_rep if name is None or pd.isnull(name) else name + for name in table._column_names + ) + iter_names = ( + itertools.chain(table.index.names, table_names) + if index + else table_names + ) + all_names = list(iter_names) + col_names = [ + '""' + if (name in (None, "") and len(all_names) == 1) + else (str(name) if name not in (None, "") else "") + for name in all_names + ] + try: + plc.io.csv.write_csv( + ( + plc.io.csv.CsvWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc.Table(columns) + ) + .names(col_names) + .na_rep(na_rep) + .include_header(header) + .rows_per_chunk(rows_per_chunk) + .line_terminator(str(lineterminator)) + .inter_column_delimiter(str(sep)) + .true_value("True") + .false_value("False") + .build() + ) + ) + except OverflowError as err: + raise OverflowError( + f"Writing CSV file with chunksize={rows_per_chunk} failed. " + "Consider providing a smaller chunksize argument." + ) from err + + +def _validate_args( + delimiter: str | None, + sep: str, + delim_whitespace: bool, + decimal: str, + thousands: str | None, + nrows: int | None, + skipfooter: int, + byte_range: list[int] | tuple[int, int] | None, + skiprows: int, +) -> None: + if delim_whitespace: + if delimiter is not None: + raise ValueError("cannot set both delimiter and delim_whitespace") + if sep != ",": + raise ValueError("cannot set both sep and delim_whitespace") + + # Alias sep -> delimiter. + actual_delimiter = delimiter if delimiter else sep + + if decimal == actual_delimiter: + raise ValueError("decimal cannot be the same as delimiter") + + if thousands == actual_delimiter: + raise ValueError("thousands cannot be the same as delimiter") + + if nrows is not None and skipfooter != 0: + raise ValueError("cannot use both nrows and skipfooter parameters") + + if byte_range is not None: + if skipfooter != 0 or skiprows != 0 or nrows is not None: + raise ValueError( + "cannot manually limit rows to be read when using the byte range parameter" + ) + + +def _get_plc_data_type_from_dtype(dtype) -> plc.DataType: + # TODO: Remove this work-around Dictionary types + # in libcudf are fully mapped to categorical columns: + # https://github.com/rapidsai/cudf/issues/3960 + if isinstance(dtype, cudf.CategoricalDtype): + dtype = dtype.categories.dtype + elif dtype == "category": + dtype = "str" + + if isinstance(dtype, str): + if dtype == "date32": + return plc.DataType(plc.types.TypeId.TIMESTAMP_DAYS) + elif dtype in ("date", "date64"): + return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS) + elif dtype == "timestamp": + return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS) + elif dtype == "timestamp[us]": + return plc.DataType(plc.types.TypeId.TIMESTAMP_MICROSECONDS) + elif dtype == "timestamp[s]": + return plc.DataType(plc.types.TypeId.TIMESTAMP_SECONDS) + elif dtype == "timestamp[ms]": + return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS) + elif dtype == "timestamp[ns]": + return plc.DataType(plc.types.TypeId.TIMESTAMP_NANOSECONDS) + + dtype = cudf.dtype(dtype) + return dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 2382e9f12ed..153ee0fa01a 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,6 +1,7 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations +import io import itertools import math import operator @@ -10,23 +11,42 @@ from collections import defaultdict from contextlib import ExitStack from functools import partial, reduce -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Literal from uuid import uuid4 import numpy as np import pandas as pd +import pyarrow as pa from pyarrow import dataset as ds +import pylibcudf as plc + import cudf -from cudf._lib import parquet as libparquet +from cudf._lib.column import Column +from cudf._lib.utils import ( + _data_from_columns, + _index_level_name, + data_from_pylibcudf_io, + generate_pandas_metadata, +) from cudf.api.types import is_list_like +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import as_column, column_empty from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking +try: + import ujson as json # type: ignore[import-untyped] +except ImportError: + import json + if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Hashable + + from typing_extensions import Self + + from cudf.core.column import ColumnBase BYTE_SIZES = { @@ -55,31 +75,200 @@ } +@acquire_spill_lock() +def _plc_write_parquet( + table, + filepaths_or_buffers, + index: bool | None = None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, + partitions_info=None, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, + write_arrow_schema: bool = False, +) -> np.ndarray | None: + """ + Cython function to call into libcudf API, see `write_parquet`. + + See Also + -------- + cudf.io.parquet.write_parquet + """ + if index is True or ( + index is None and not isinstance(table.index, cudf.RangeIndex) + ): + columns = itertools.chain(table.index._columns, table._columns) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + tbl_meta.column_metadata[level].set_name( + _index_level_name(idx_name, level, table._column_names) + ) + num_index_cols_meta = len(table.index.names) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + num_index_cols_meta = 0 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + if not isinstance(name, str): + if cudf.get_option("mode.pandas_compatible"): + tbl_meta.column_metadata[i].set_name(str(name)) + else: + raise ValueError( + "Writing a Parquet file requires string column names" + ) + else: + tbl_meta.column_metadata[i].set_name(name) + + _set_col_metadata( + table[name]._column, + tbl_meta.column_metadata[i], + force_nullable_schema, + None, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + if partitions_info is not None: + user_data = [ + { + "pandas": generate_pandas_metadata( + table.iloc[start_row : start_row + num_row].copy( + deep=False + ), + index, + ) + } + for start_row, num_row in partitions_info + ] + else: + user_data = [{"pandas": generate_pandas_metadata(table, index)}] + + if header_version not in ("1.0", "2.0"): + raise ValueError( + f"Invalid parquet header version: {header_version}. " + "Valid values are '1.0' and '2.0'" + ) + + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) + + comp_type = _get_comp_type(compression) + stat_freq = _get_stat_freq(statistics) + options = ( + plc.io.parquet.ParquetWriterOptions.builder( + plc.io.SinkInfo(filepaths_or_buffers), plc_table + ) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + .stats_level(stat_freq) + .int96_timestamps(int96_timestamps) + .write_v2_headers(header_version == "2.0") + .dictionary_policy(dict_policy) + .utc_timestamps(False) + .write_arrow_schema(write_arrow_schema) + .build() + ) + if partitions_info is not None: + options.set_partitions( + [ + plc.io.types.PartitionInfo(part[0], part[1]) + for part in partitions_info + ] + ) + if metadata_file_path is not None: + if is_list_like(metadata_file_path): + options.set_column_chunks_file_paths(metadata_file_path) + else: + options.set_column_chunks_file_paths([metadata_file_path]) + if row_group_size_bytes is not None: + options.set_row_group_size_bytes(row_group_size_bytes) + if row_group_size_rows is not None: + options.set_row_group_size_rows(row_group_size_rows) + if max_page_size_bytes is not None: + options.set_max_page_size_bytes(max_page_size_bytes) + if max_page_size_rows is not None: + options.set_max_page_size_rows(max_page_size_rows) + if max_dictionary_size is not None: + options.set_max_dictionary_size(max_dictionary_size) + blob = plc.io.parquet.write_parquet(options) + if metadata_file_path is not None: + return np.asarray(blob.obj) + else: + return None + + @_performance_tracking def _write_parquet( df, paths, - compression="snappy", - index=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + index: bool | None = None, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, partitions_info=None, storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, - write_arrow_schema=True, -): + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, + write_arrow_schema: bool = True, +) -> np.ndarray | None: if is_list_like(paths) and len(paths) > 1: if partitions_info is None: ValueError("partition info is required for multiple paths") @@ -124,11 +313,11 @@ def _write_parquet( file_objs = [ ioutils.get_IOBase_writer(file_obj) for file_obj in fsspec_objs ] - write_parquet_res = libparquet.write_parquet( + write_parquet_res = _plc_write_parquet( df, filepaths_or_buffers=file_objs, **common_args ) else: - write_parquet_res = libparquet.write_parquet( + write_parquet_res = _plc_write_parquet( df, filepaths_or_buffers=paths_or_bufs, **common_args ) @@ -141,26 +330,38 @@ def _write_parquet( def write_to_dataset( df, root_path, - compression="snappy", + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", filename=None, partition_cols=None, fs=None, - preserve_index=False, - return_metadata=False, - statistics="ROWGROUP", - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, + preserve_index: bool = False, + return_metadata: bool = False, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, store_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. @@ -330,9 +531,29 @@ def write_to_dataset( return metadata +def _parse_metadata(meta) -> tuple[bool, Any, Any]: + file_is_range_index = False + file_index_cols = None + file_column_dtype = None + + if "index_columns" in meta and len(meta["index_columns"]) > 0: + file_index_cols = meta["index_columns"] + + if ( + isinstance(file_index_cols[0], dict) + and file_index_cols[0]["kind"] == "range" + ): + file_is_range_index = True + if "column_indexes" in meta and len(meta["column_indexes"]) == 1: + file_column_dtype = meta["column_indexes"][0]["numpy_type"] + return file_is_range_index, file_index_cols, file_column_dtype + + @ioutils.doc_read_parquet_metadata() @_performance_tracking -def read_parquet_metadata(filepath_or_buffer): +def read_parquet_metadata( + filepath_or_buffer, +) -> tuple[int, int, list[Hashable], int, list[dict[str, int]]]: """{docstring}""" # List of filepaths or buffers @@ -341,7 +562,39 @@ def read_parquet_metadata(filepath_or_buffer): bytes_per_thread=None, ) - return libparquet.read_parquet_metadata(filepaths_or_buffers) + parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( + plc.io.SourceInfo(filepaths_or_buffers) + ) + + # read all column names including index column, if any + col_names = [ + info.name() for info in parquet_metadata.schema().root().children() + ] + + index_col_names = set() + json_str = parquet_metadata.metadata()["pandas"] + if json_str != "": + meta = json.loads(json_str) + file_is_range_index, index_col, _ = _parse_metadata(meta) + if not file_is_range_index and index_col is not None: + columns = meta["columns"] + for idx_col in index_col: + for c in columns: + if c["field_name"] == idx_col: + index_col_names.add(idx_col) + + # remove the index column from the list of column names + # only if index_col_names is not None + if len(index_col_names) >= 0: + col_names = [name for name in col_names if name not in index_col_names] + + return ( + parquet_metadata.num_rows(), + parquet_metadata.num_rowgroups(), + col_names, + len(col_names), + parquet_metadata.rowgroup_metadata(), + ) @_performance_tracking @@ -886,7 +1139,6 @@ def _parquet_to_frame( dfs[-1][name] = column_empty( row_count=_len, dtype=_dtype, - masked=True, ) else: dfs[-1][name] = as_column( @@ -913,16 +1165,18 @@ def _read_parquet( columns=None, row_groups=None, use_pandas_metadata=None, - nrows=None, - skip_rows=None, - allow_mismatched_pq_schemas=False, + nrows: int | None = None, + skip_rows: int | None = None, + allow_mismatched_pq_schemas: bool = False, *args, **kwargs, -): +) -> cudf.DataFrame: # Simple helper function to dispatch between # cudf and pyarrow to read parquet data if engine == "cudf": - if kwargs: + if set(kwargs.keys()).difference( + set(("_chunk_read_limit", "_pass_read_limit")) + ): raise ValueError( "cudf engine doesn't support the " f"following keyword arguments: {list(kwargs.keys())}" @@ -932,30 +1186,123 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) + if nrows is None: + nrows = -1 + if skip_rows is None: + skip_rows = 0 if cudf.get_option("io.parquet.low_memory"): - return libparquet.read_parquet_chunked( + # Note: If this function ever takes accepts filters + # allow_range_index needs to be False when a filter is passed + # (see read_parquet) + allow_range_index = columns is not None and len(columns) != 0 + + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + + reader = plc.io.parquet.ChunkedParquetReader( + options, + chunk_read_limit=kwargs.get("_chunk_read_limit", 0), + pass_read_limit=kwargs.get("_pass_read_limit", 1024000000), + ) + + tbl_w_meta = reader.read_chunk() + column_names = tbl_w_meta.column_names(include_children=False) + child_names = tbl_w_meta.child_names + per_file_user_data = tbl_w_meta.per_file_user_data + concatenated_columns = tbl_w_meta.tbl.columns() + + # save memory + del tbl_w_meta + + while reader.has_next(): + tbl = reader.read_chunk().tbl + + for i in range(tbl.num_columns()): + concatenated_columns[i] = plc.concatenate.concatenate( + [concatenated_columns[i], tbl._columns[i]] + ) + # Drop residual columns to save memory + tbl._columns[i] = None + + df = cudf.DataFrame._from_data( + *_data_from_columns( + columns=[ + Column.from_pylibcudf(plc) + for plc in concatenated_columns + ], + column_names=column_names, + index_names=None, + ) + ) + df = _process_metadata( + df, + column_names, + child_names, + per_file_user_data, + row_groups, filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - nrows=nrows if nrows is not None else -1, - skip_rows=skip_rows if skip_rows is not None else 0, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, + allow_range_index, + use_pandas_metadata, + nrows=nrows, + skip_rows=skip_rows, ) + return df else: - if nrows is None: - nrows = -1 - if skip_rows is None: - skip_rows = 0 - return libparquet.read_parquet( + allow_range_index = True + filters = kwargs.get("filters", None) + if columns is not None and len(columns) == 0 or filters: + allow_range_index = False + + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + if filters is not None: + options.set_filter(filters) + + tbl_w_meta = plc.io.parquet.read_parquet(options) + + df = cudf.DataFrame._from_data(*data_from_pylibcudf_io(tbl_w_meta)) + + df = _process_metadata( + df, + tbl_w_meta.column_names(include_children=False), + tbl_w_meta.child_names, + tbl_w_meta.per_file_user_data, + row_groups, filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, + allow_range_index, + use_pandas_metadata, nrows=nrows, skip_rows=skip_rows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) + return df else: if ( isinstance(filepaths_or_buffers, list) @@ -980,28 +1327,40 @@ def to_parquet( df, path, engine="cudf", - compression="snappy", - index=None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + index: bool | None = None, partition_cols=None, partition_file_name=None, partition_offsets=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, storage_options=None, - return_metadata=False, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, + return_metadata: bool = False, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, store_schema=False, *args, **kwargs, @@ -1114,10 +1473,11 @@ def to_parquet( @ioutils.doc_merge_parquet_filemetadata() -def merge_parquet_filemetadata(filemetadata_list): +def merge_parquet_filemetadata(filemetadata_list: list) -> np.ndarray: """{docstring}""" - - return libparquet.merge_filemetadata(filemetadata_list) + return np.asarray( + plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj + ) def _generate_filename(): @@ -1205,10 +1565,207 @@ def _get_groups_and_offsets( return part_names, grouped_df, part_offsets -ParquetWriter = libparquet.ParquetWriter +class ParquetWriter: + """ + ParquetWriter lets you incrementally write out a Parquet file from a series + of cudf tables + + Parameters + ---------- + filepath_or_buffer : str, io.IOBase, os.PathLike, or list + File path or buffer to write to. The argument may also correspond + to a list of file paths or buffers. + index : bool or None, default None + If ``True``, include a dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. If ``None``, + index(es) other than RangeIndex will be saved as columns. + compression : {'snappy', None}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' + Level at which column statistics should be included in file. + row_group_size_bytes: int, default ``uint64 max`` + Maximum size of each stripe of the output. + By default, a virtually infinite size equal to ``uint64 max`` will be used. + row_group_size_rows: int, default 1000000 + Maximum number of rows of each stripe of the output. + By default, 1000000 (10^6 rows) will be used. + max_page_size_bytes: int, default 524288 + Maximum uncompressed size of each page of the output. + By default, 524288 (512KB) will be used. + max_page_size_rows: int, default 20000 + Maximum number of rows of each page of the output. + By default, 20000 will be used. + max_dictionary_size: int, default 1048576 + Maximum size of the dictionary page for each output column chunk. Dictionary + encoding for column chunks that exceeds this limit will be disabled. + By default, 1048576 (1MB) will be used. + use_dictionary : bool, default True + If ``True``, enable dictionary encoding for Parquet page data + subject to ``max_dictionary_size`` constraints. + If ``False``, disable dictionary encoding for Parquet page data. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. + + See Also + -------- + cudf.io.parquet.write_parquet + """ + + def __init__( + self, + filepath_or_buffer, + index: bool | None = None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + row_group_size_bytes: int = int(np.iinfo(np.uint64).max), + row_group_size_rows: int = 1000000, + max_page_size_bytes: int = 524288, + max_page_size_rows: int = 20000, + max_dictionary_size: int = 1048576, + use_dictionary: bool = True, + store_schema: bool = False, + ): + filepaths_or_buffers = ( + list(filepath_or_buffer) + if is_list_like(filepath_or_buffer) + else [filepath_or_buffer] + ) + self.sink = plc.io.SinkInfo(filepaths_or_buffers) + self.statistics = statistics + self.compression = compression + self.index = index + self.initialized = False + self.row_group_size_bytes = row_group_size_bytes + self.row_group_size_rows = row_group_size_rows + self.max_page_size_bytes = max_page_size_bytes + self.max_page_size_rows = max_page_size_rows + self.max_dictionary_size = max_dictionary_size + self.use_dictionary = use_dictionary + self.write_arrow_schema = store_schema + + def write_table(self, table, partitions_info=None) -> None: + """Writes a single table to the file""" + if not self.initialized: + self._initialize_chunked_state( + table, + num_partitions=len(partitions_info) if partitions_info else 1, + ) + if self.index is not False and ( + table.index.name is not None + or isinstance(table.index, cudf.MultiIndex) + ): + columns = itertools.chain(table.index._columns, table._columns) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.writer.write(plc_table, partitions_info) + + def close(self, metadata_file_path=None) -> np.ndarray | None: + if not self.initialized: + return None + column_chunks_file_paths = [] + if metadata_file_path is not None: + if is_list_like(metadata_file_path): + column_chunks_file_paths = list(metadata_file_path) + else: + column_chunks_file_paths = [metadata_file_path] + blob = self.writer.close(column_chunks_file_paths) + if metadata_file_path is not None: + return np.asarray(blob.obj) + return None + + def __enter__(self) -> Self: + return self + + def __exit__(self, *args) -> None: + self.close() + + def _initialize_chunked_state( + self, table, num_partitions: int = 1 + ) -> None: + """Prepares all the values required to build the + chunked_parquet_writer_options and creates a writer + """ + # Set the table_metadata + num_index_cols_meta = 0 + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + if self.index is not False: + if isinstance(table.index, cudf.MultiIndex): + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + self.tbl_meta.column_metadata[level].set_name(idx_name) + num_index_cols_meta = len(table.index.names) + else: + if table.index.name is not None: + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + self.tbl_meta.column_metadata[0].set_name(table.index.name) + num_index_cols_meta = 1 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + self.tbl_meta.column_metadata[i].set_name(name) + _set_col_metadata( + table[name]._column, + self.tbl_meta.column_metadata[i], + ) -def _parse_bytes(s): + index = ( + False if isinstance(table.index, cudf.RangeIndex) else self.index + ) + user_data = [ + {"pandas": generate_pandas_metadata(table, index)} + ] * num_partitions + comp_type = _get_comp_type(self.compression) + stat_freq = _get_stat_freq(self.statistics) + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if self.use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) + options = ( + plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + .stats_level(stat_freq) + .row_group_size_bytes(self.row_group_size_bytes) + .row_group_size_rows(self.row_group_size_rows) + .max_page_size_bytes(self.max_page_size_bytes) + .max_page_size_rows(self.max_page_size_rows) + .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) + .build() + ) + options.set_dictionary_policy(dict_policy) + self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) + self.initialized = True + + +def _parse_bytes(s: str) -> int: """Parse byte string to numbers Utility function vendored from Dask. @@ -1345,8 +1902,8 @@ def __init__( path, partition_cols, index=None, - compression="snappy", - statistics="ROWGROUP", + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", max_file_size=None, file_name_prefix=None, storage_options=None, @@ -1370,9 +1927,7 @@ def __init__( self.partition_cols = partition_cols # Collection of `ParquetWriter`s, and the corresponding # partition_col values they're responsible for - self._chunked_writers: list[ - tuple[libparquet.ParquetWriter, list[str], str] - ] = [] + self._chunked_writers: list[tuple[ParquetWriter, list[str], str]] = [] # Map of partition_col values to their ParquetWriter's index # in self._chunked_writers for reverse lookup self.path_cw_map: dict[str, int] = {} @@ -1563,3 +2118,257 @@ def _hive_dirname(name, val): if pd.isna(val): val = "__HIVE_DEFAULT_PARTITION__" return f"{name}={val}" + + +def _set_col_metadata( + col: ColumnBase, + col_meta: plc.io.types.ColumnInMetadata, + force_nullable_schema: bool = False, + path: str | None = None, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, +) -> None: + need_path = ( + skip_compression is not None + or column_encoding is not None + or column_type_length is not None + or output_as_binary is not None + ) + name = col_meta.get_name() if need_path else None + full_path = ( + path + "." + name if (path is not None and name is not None) else name + ) + + if force_nullable_schema: + # Only set nullability if `force_nullable_schema` + # is true. + col_meta.set_nullability(True) + + if skip_compression is not None and full_path in skip_compression: + col_meta.set_skip_compression(True) + + if column_encoding is not None and full_path in column_encoding: + encoding = column_encoding[full_path] + if encoding is None: + c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT + else: + enc = str(encoding).upper() + c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) + if c_encoding is None: + raise ValueError("Unsupported `column_encoding` type") + col_meta.set_encoding(c_encoding) + + if column_type_length is not None and full_path in column_type_length: + col_meta.set_output_as_binary(True) + col_meta.set_type_length(column_type_length[full_path]) + + if output_as_binary is not None and full_path in output_as_binary: + col_meta.set_output_as_binary(True) + + if isinstance(col.dtype, cudf.StructDtype): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name) + _set_col_metadata( + child_col, + col_meta.child(i), + force_nullable_schema, + full_path, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + elif isinstance(col.dtype, cudf.ListDtype): + if full_path is not None: + full_path = full_path + ".list" + col_meta.child(1).set_name("element") + _set_col_metadata( + col.children[1], + col_meta.child(1), + force_nullable_schema, + full_path, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): + col_meta.set_decimal_precision(col.dtype.precision) + + +def _get_comp_type( + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None], +) -> plc.io.types.CompressionType: + if compression is None: + return plc.io.types.CompressionType.NONE + result = getattr(plc.io.types.CompressionType, compression.upper(), None) + if result is None: + raise ValueError("Unsupported `compression` type") + return result + + +def _get_stat_freq( + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"], +) -> plc.io.types.StatisticsFreq: + result = getattr( + plc.io.types.StatisticsFreq, f"STATISTICS_{statistics.upper()}", None + ) + if result is None: + raise ValueError("Unsupported `statistics_freq` type") + return result + + +def _process_metadata( + df: cudf.DataFrame, + names: list[Hashable], + child_names: dict, + per_file_user_data: list, + row_groups, + filepaths_or_buffers, + allow_range_index: bool, + use_pandas_metadata: bool, + nrows: int = -1, + skip_rows: int = 0, +) -> cudf.DataFrame: + ioutils._add_df_col_struct_names(df, child_names) + index_col = None + is_range_index = True + column_index_type = None + index_col_names = None + meta = None + for single_file in per_file_user_data: + if b"pandas" not in single_file: + continue + json_str = single_file[b"pandas"].decode("utf-8") + meta = json.loads(json_str) + file_is_range_index, index_col, column_index_type = _parse_metadata( + meta + ) + is_range_index &= file_is_range_index + + if ( + not file_is_range_index + and index_col is not None + and index_col_names is None + ): + index_col_names = {} + for idx_col in index_col: + for c in meta["columns"]: + if c["field_name"] == idx_col: + index_col_names[idx_col] = c["name"] + + if meta is not None: + # Book keep each column metadata as the order + # of `meta["columns"]` and `column_names` are not + # guaranteed to be deterministic and same always. + meta_data_per_column = { + col_meta["name"]: col_meta for col_meta in meta["columns"] + } + + # update the decimal precision of each column + for col in names: + if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): + df._data[col].dtype.precision = meta_data_per_column[col][ + "metadata" + ]["precision"] + + # Set the index column + if index_col is not None and len(index_col) > 0: + if is_range_index: + if not allow_range_index: + return df + + if len(per_file_user_data) > 1: + range_index_meta = { + "kind": "range", + "name": None, + "start": 0, + "stop": len(df), + "step": 1, + } + else: + range_index_meta = index_col[0] + + if row_groups is not None: + per_file_metadata = [ + pa.parquet.read_metadata( + # Pyarrow cannot read directly from bytes + io.BytesIO(s) if isinstance(s, bytes) else s + ) + for s in filepaths_or_buffers + ] + + filtered_idx = [] + for i, file_meta in enumerate(per_file_metadata): + row_groups_i = [] + start = 0 + for row_group in range(file_meta.num_row_groups): + stop = start + file_meta.row_group(row_group).num_rows + row_groups_i.append((start, stop)) + start = stop + + for rg in row_groups[i]: + filtered_idx.append( + cudf.RangeIndex( + start=row_groups_i[rg][0], + stop=row_groups_i[rg][1], + step=range_index_meta["step"], + ) + ) + + if len(filtered_idx) > 0: + idx = cudf.concat(filtered_idx) + else: + idx = cudf.Index._from_column( + cudf.core.column.column_empty(0) + ) + else: + start = range_index_meta["start"] + skip_rows # type: ignore[operator] + stop = range_index_meta["stop"] + if nrows > -1: + stop = start + nrows + idx = cudf.RangeIndex( + start=start, + stop=stop, + step=range_index_meta["step"], + name=range_index_meta["name"], + ) + + df.index = idx + elif set(index_col).issubset(names): + index_data = df[index_col] + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: + idx = cudf.Index._from_column( + index_data._columns[0], name=next(actual_index_names) + ) + else: + idx = cudf.MultiIndex.from_frame( + index_data, names=list(actual_index_names) + ) + df.drop(columns=index_col, inplace=True) + df.index = idx + else: + if use_pandas_metadata: + df.index.names = index_col + + if df._num_columns == 0 and column_index_type is not None: + df._data.label_dtype = cudf.dtype(column_index_type) + + return df diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl index 1ec077d10f7..64e06f0631d 100644 Binary files a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl and b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl differ diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 13efa71ebae..77d1f77d30b 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -22,7 +22,6 @@ from pyarrow import parquet as pq import cudf -from cudf._lib.parquet import read_parquet_chunked from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.io.parquet import ( ParquetDatasetWriter, @@ -3775,13 +3774,14 @@ def test_parquet_chunked_reader( ) buffer = BytesIO() df.to_parquet(buffer, row_group_size=10000) - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - use_pandas_metadata=use_pandas_metadata, - row_groups=row_groups, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + use_pandas_metadata=use_pandas_metadata, + row_groups=row_groups, + ) expected = cudf.read_parquet( buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups ) @@ -3825,12 +3825,13 @@ def test_parquet_chunked_reader_structs( # Number of rows to read nrows = num_rows if num_rows is not None else len(df) - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - nrows=nrows, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + ) expected = cudf.read_parquet( buffer, nrows=nrows, @@ -3877,12 +3878,13 @@ def test_parquet_chunked_reader_string_decoders( nrows = num_rows if num_rows is not None else len(df) # Check with num_rows specified - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - nrows=nrows, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + ) expected = cudf.read_parquet( buffer, nrows=nrows, @@ -3982,13 +3984,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema): ).reset_index(drop=True) # Read with chunked reader (filter columns not supported) - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["list", "d_list", "str"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["list", "d_list", "str"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) # Construct the expected table without filter columns expected_chunked = cudf.concat( @@ -4054,13 +4057,14 @@ def test_parquet_reader_with_mismatched_structs(): ) # Read with chunked reader - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["struct.b.b_b.b_b_a"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["struct.b.b_b.b_b_a"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) got_chunked = ( cudf.Series(got_chunked["struct"]) .struct.field("b") diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 68f2aaf9cab..b50ed04427f 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd import pytest +from packaging import version import cudf from cudf.testing import _utils as utils, assert_eq @@ -149,13 +150,19 @@ def test_serialize(df, to_host): def test_serialize_dtype_error_checking(): dtype = cudf.IntervalDtype("float", "right") - header, frames = dtype.serialize() - with pytest.raises(AssertionError): - # Invalid number of frames - type(dtype).deserialize(header, [None] * (header["frame_count"] + 1)) + # Must call device_serialize (not serialize) to ensure that the type metadata is + # encoded in the header. + header, frames = dtype.device_serialize() with pytest.raises(AssertionError): # mismatching class cudf.StructDtype.deserialize(header, frames) + # The is-cuda flag list length must match the number of frames + header["is-cuda"] = [False] + with pytest.raises(AssertionError): + # Invalid number of frames + type(dtype).deserialize( + header, [np.zeros(1)] * (header["frame_count"] + 1) + ) def test_serialize_dataframe(): @@ -382,6 +389,10 @@ def test_serialize_string_check_buffer_sizes(): assert expect == got +@pytest.mark.skipif( + version.parse(np.__version__) < version.parse("2.0.0"), + reason="The serialization of numpy 2.0 types is incompatible with numpy 1.x", +) def test_deserialize_cudf_23_12(datadir): fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_23.12.pkl" diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 899d78c999b..b85943626a6 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -79,7 +79,7 @@ def test_series_construction_with_nulls(): ) def test_serialize_struct_dtype(fields): dtype = cudf.StructDtype(fields) - recreated = dtype.__class__.deserialize(*dtype.serialize()) + recreated = dtype.__class__.device_deserialize(*dtype.device_serialize()) assert recreated == dtype diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index d9a3da6666d..a04fcb8df7a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -43,7 +43,6 @@ } _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index e726b7fdca1..3891110e9d3 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -76,13 +76,6 @@ files: - py_version - test_base - test_xgboost - test_catboost: - output: none - includes: - - cuda_version - - py_version - - test_base - - test_catboost test_cuml: output: none includes: @@ -251,14 +244,6 @@ dependencies: - pip - pip: - xgboost>=2.0.1 - test_catboost: - common: - - output_types: conda - packages: - - numpy - - scipy - - scikit-learn - - catboost test_cuml: common: - output_types: conda diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py deleted file mode 100644 index 04cc69231fe..00000000000 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pytest -from catboost import CatBoostClassifier, CatBoostRegressor, Pool -from sklearn.datasets import make_classification, make_regression - -rng = np.random.default_rng(seed=42) - - -def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0): - if isinstance(expect, (tuple, list)): - assert len(expect) == len(got) - for e, g in zip(expect, got): - assert_catboost_equal(e, g, rtol, atol) - elif isinstance(expect, np.ndarray): - np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol) - elif isinstance(expect, pd.DataFrame): - pd.testing.assert_frame_equal(expect, got) - elif isinstance(expect, pd.Series): - pd.testing.assert_series_equal(expect, got) - else: - assert expect == got - - -pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal) - - -@pytest.fixture -def regression_data(): - X, y = make_regression(n_samples=100, n_features=10, random_state=42) - return pd.DataFrame(X), pd.Series(y) - - -@pytest.fixture -def classification_data(): - X, y = make_classification( - n_samples=100, n_features=10, n_classes=2, random_state=42 - ) - return pd.DataFrame(X), pd.Series(y) - - -def test_catboost_regressor_with_dataframe(regression_data): - X, y = regression_data - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(X, y) - predictions = model.predict(X) - return predictions - - -def test_catboost_regressor_with_numpy(regression_data): - X, y = regression_data - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(X.values, y.values) - predictions = model.predict(X.values) - return predictions - - -def test_catboost_classifier_with_dataframe(classification_data): - X, y = classification_data - model = CatBoostClassifier(iterations=10, verbose=0) - model.fit(X, y) - predictions = model.predict(X) - return predictions - - -def test_catboost_classifier_with_numpy(classification_data): - X, y = classification_data - model = CatBoostClassifier(iterations=10, verbose=0) - model.fit(X.values, y.values) - predictions = model.predict(X.values) - return predictions - - -def test_catboost_with_pool_and_dataframe(regression_data): - X, y = regression_data - train_pool = Pool(X, y) - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(train_pool) - predictions = model.predict(X) - return predictions - - -def test_catboost_with_pool_and_numpy(regression_data): - X, y = regression_data - train_pool = Pool(X.values, y.values) - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(train_pool) - predictions = model.predict(X.values) - return predictions - - -def test_catboost_with_categorical_features(): - data = { - "numerical_feature": rng.standard_normal(100), - "categorical_feature": rng.choice(["A", "B", "C"], size=100), - "target": rng.integers(0, 2, size=100), - } - df = pd.DataFrame(data) - X = df[["numerical_feature", "categorical_feature"]] - y = df["target"] - cat_features = ["categorical_feature"] - model = CatBoostClassifier( - iterations=10, verbose=0, cat_features=cat_features - ) - model.fit(X, y) - predictions = model.predict(X) - return predictions - - -@pytest.mark.parametrize( - "X, y", - [ - ( - pd.DataFrame(rng.standard_normal((100, 5))), - pd.Series(rng.standard_normal(100)), - ), - (rng.standard_normal((100, 5)), rng.standard_normal(100)), - ], -) -def test_catboost_train_test_split(X, y): - from sklearn.model_selection import train_test_split - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(X_train, y_train) - predictions = model.predict(X_test) - return len(X_train), len(X_test), len(y_train), len(y_test), predictions diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py index bef02c86355..8be48953974 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py @@ -71,6 +71,9 @@ def test_holoviews_heatmap(df): ) +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_holoviews_histogram(df): return get_plot_info(hv.Histogram(df.values)) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py index 1909392b9f7..c91808021e8 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py @@ -33,6 +33,9 @@ def assert_plots_equal(expect, got): pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal) +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_line(): df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]}) (data,) = plt.plot(df["x"], df["y"], marker="o", linestyle="-") @@ -40,6 +43,9 @@ def test_line(): return plt.gca() +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_bar(): data = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) ax = data.plot(kind="bar") diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py index 472f1889354..4d35d9e8946 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py @@ -37,6 +37,9 @@ def test_numpy_dot(df): return np.dot(df, df.T) +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_numpy_fft(sr): fft = np.fft.fft(sr) return fft diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py index ad287471aa0..7cea635afc4 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py @@ -116,6 +116,9 @@ def test_torch_train(data): return model(test_x1, test_x2) +@pytest.mark.skip( + reason="AssertionError: The values for attribute 'device' do not match: cpu != cuda:0." +) def test_torch_tensor_ctor(): s = pd.Series(range(5)) return torch.tensor(s.values) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py index 021c5bac9b7..f6a8a96ae3c 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py @@ -54,6 +54,9 @@ def test_scatter(df): return ax +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_lineplot_with_sns_data(): df = sns.load_dataset("flights") ax = sns.lineplot(data=df, x="month", y="passengers") diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py index 0777d982ac2..f275659288e 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py @@ -41,7 +41,7 @@ def test_multidimensional_distributed_timeseries(dask_client): rng = np.random.default_rng(seed=42) # Each row represents data from a different dimension while each column represents # data from the same dimension - your_time_series = rng.random(3, 1000) + your_time_series = rng.random((3, 1000)) # Approximately, how many data points might be found in a pattern window_size = 50 diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py index ba1f518cbfd..b4fad3024e7 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py @@ -271,6 +271,7 @@ def call(self, values): return tf.concat(values, axis=-1) +@pytest.mark.xfail(reason="ValueError: Invalid dtype: object") def test_full_example_train_with_df(df, target): # https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example # Inputs are directly passed as dictionary of series diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py index 70f1e6a4250..0fd632507a6 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py @@ -113,6 +113,9 @@ def test_with_external_memory( return predt +@pytest.mark.skip( + reason="TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly." +) @pytest.mark.parametrize("device", ["cpu", "cuda"]) def test_predict(device: str) -> np.ndarray: reg = xgb.XGBRegressor(n_estimators=2, device=device) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index d03180852eb..c28b7e49207 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -4,7 +4,7 @@ import pytest import dask -from dask import dataframe as dd +from dask import array as da, dataframe as dd from dask.distributed import Client from distributed.utils_test import cleanup, loop, loop_in_thread # noqa: F401 @@ -121,3 +121,17 @@ def test_unique(): ddf.x.unique().compute(), check_index=False, ) + + +def test_serialization_of_numpy_types(): + # Dask uses numpy integers as column names, which can break cudf serialization + with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: + with Client(cluster): + with dask.config.set( + {"dataframe.backend": "cudf", "array.backend": "cupy"} + ): + rng = da.random.default_rng() + X_arr = rng.random((100, 10), chunks=(50, 10)) + X = dd.from_dask_array(X_arr) + X = X[X.columns[0]] + X.compute()