diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a4a8f036174..d7d14ea12ff 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -52,7 +52,7 @@ jobs: steps: - name: Get PR info id: get-pr-info - uses: rapidsai/shared-actions/get-pr-info@main + uses: nv-gha-runners/get-pr-info@main - name: Checkout code repo uses: actions/checkout@v4 with: diff --git a/.gitignore b/.gitignore index 619e1464b2a..180a6a286e2 100644 --- a/.gitignore +++ b/.gitignore @@ -80,7 +80,6 @@ build/ cpp/build/ cpp/examples/*/install/ cpp/examples/*/build/ -cpp/examples/tpch/datagen/datafusion cpp/include/cudf/ipc_generated/*.h cpp/thirdparty/googletest/ diff --git a/README.md b/README.md index f62f7885d63..8f8c2adac2f 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 ### Conda -cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel: +cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: ```bash conda install -c rapidsai -c conda-forge -c nvidia \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7c387a6fc2a..26c086046a8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -385,6 +385,7 @@ add_library( src/io/json/nested_json_gpu.cu src/io/json/read_json.cu src/io/json/parser_features.cpp + src/io/json/process_tokens.cu src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d2c22b788cb..6c5f4a68a4c 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -36,25 +36,25 @@ target_include_directories( ) add_library( - tpch_data_generator STATIC - common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp - common/tpch_data_generator/random_column_generator.cu + ndsh_data_generator STATIC + common/ndsh_data_generator/ndsh_data_generator.cpp common/ndsh_data_generator/table_helpers.cpp + common/ndsh_data_generator/random_column_generator.cu ) -target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17) +target_compile_features(ndsh_data_generator PUBLIC cxx_std_17 cuda_std_17) target_compile_options( - tpch_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>" + ndsh_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>" "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>" ) target_link_libraries( - tpch_data_generator + ndsh_data_generator PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env> ) target_include_directories( - tpch_data_generator + ndsh_data_generator PUBLIC "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/include>" ) @@ -127,8 +127,8 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( - ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::nvbench - $<TARGET_NAME_IF_EXISTS:conda_env> + ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen + nvbench::nvbench $<TARGET_NAME_IF_EXISTS:conda_env> ) install( TARGETS ${CMAKE_BENCH_NAME} @@ -175,6 +175,14 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp) # ################################################################################################## # * transpose benchmark --------------------------------------------------------------------------- ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp) +# ################################################################################################## +# * nds-h benchmark 
-------------------------------------------------------------------------------- +ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp) + # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( @@ -329,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp similarity index 97% rename from cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp rename to cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp index 236fe8095ad..fa7edd225ba 100644 --- a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "tpch_data_generator.hpp" +#include "ndsh_data_generator.hpp" #include "random_column_generator.hpp" #include "table_helpers.hpp" @@ -435,46 +435,37 @@ std::unique_ptr<cudf::table> generate_lineitem_partial(cudf::table_view const& o columns.push_back(std::move(l_quantity)); columns.push_back(std::move(l_discount)); columns.push_back(std::move(l_tax)); + columns.push_back(std::move(l_returnflag)); + columns.push_back(std::move(l_linestatus)); columns.push_back(std::move(l_shipdate_ts)); columns.push_back(std::move(l_commitdate_ts)); columns.push_back(std::move(l_receiptdate_ts)); - columns.push_back(std::move(l_returnflag)); - columns.push_back(std::move(l_linestatus)); columns.push_back(std::move(l_shipinstruct)); columns.push_back(std::move(l_shipmode)); columns.push_back(std::move(l_comment)); return std::make_unique<cudf::table>(std::move(columns)); } -std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem, +/** + * @brief Generate the part of the `orders` table dependent on the `lineitem` table + * + * @param lineitem_partial The partially generated `lineitem` table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem_partial, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto const l_linestatus_mask = lineitem.column(0); - auto const l_orderkey = lineitem.column(1); - auto const l_discount = lineitem.column(6); - auto const l_tax = lineitem.column(7); - auto const l_extendedprice = lineitem.column(16); + auto const l_linestatus_mask = lineitem_partial.column(0); + auto const l_orderkey = lineitem_partial.column(1); + auto const l_extendedprice = lineitem_partial.column(6); + auto const l_discount = lineitem_partial.column(7); + auto const l_tax = 
lineitem_partial.column(8); std::vector<std::unique_ptr<cudf::column>> orders_dependent_columns; - // Generate the `o_totalprice` column - // We calculate the `charge` column, which is a function of `l_extendedprice`, - // `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge` - auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr); - auto o_totalprice = [&]() { - auto const keys = cudf::table_view({l_orderkey}); - cudf::groupby::groupby gb(keys); - std::vector<cudf::groupby::aggregation_request> requests; - requests.push_back(cudf::groupby::aggregation_request()); - requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>()); - requests[0].values = l_charge->view(); - auto agg_result = gb.aggregate(requests); - return cudf::round(agg_result.second[0].results[0]->view(), 2); - }(); - orders_dependent_columns.push_back(std::move(o_totalprice)); - // Generate the `o_orderstatus` column auto o_orderstatus = [&]() { auto const keys = cudf::table_view({l_orderkey}); @@ -529,6 +520,22 @@ std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& l cudf::string_scalar("P"), o_orderstatus_intermediate->view(), mask_b->view()); }(); orders_dependent_columns.push_back(std::move(o_orderstatus)); + + // Generate the `o_totalprice` column + // We calculate the `charge` column, which is a function of `l_extendedprice`, + // `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge` + auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr); + auto o_totalprice = [&]() { + auto const keys = cudf::table_view({l_orderkey}); + cudf::groupby::groupby gb(keys); + std::vector<cudf::groupby::aggregation_request> requests; + requests.push_back(cudf::groupby::aggregation_request()); + requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>()); + requests[0].values = l_charge->view(); + auto agg_result = gb.aggregate(requests); + return cudf::round(agg_result.second[0].results[0]->view(), 2); + }(); + orders_dependent_columns.push_back(std::move(o_totalprice)); return std::make_unique<cudf::table>(std::move(orders_dependent_columns)); } @@ -730,9 +737,7 @@ generate_orders_lineitem_part(double scale_factor, // Generate the `part` table auto part = generate_part(scale_factor, stream, mr); - // Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column, - // add the column to the `lineitem` table, and write the `lineitem` table to a parquet file - + // Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column auto l_extendedprice = [&]() { auto const left = cudf::table_view( {lineitem_partial->get_column(2).view(), lineitem_partial->get_column(5).view()}); @@ -752,8 +757,9 @@ return cudf::round(col->view(), 2); }(); + // Insert the `l_extendedprice` column into the partial columns of the `lineitem` table auto lineitem_partial_columns = lineitem_partial->release(); - lineitem_partial_columns.push_back(std::move(l_extendedprice)); + lineitem_partial_columns.insert(lineitem_partial_columns.begin() + 6, std::move(l_extendedprice)); auto lineitem_temp = std::make_unique<cudf::table>(std::move(lineitem_partial_columns)); // Generate the dependent columns of the `orders` table @@ -762,7 +768,7 @@ auto orders_independent_columns = orders_independent->release(); auto orders_dependent_columns = orders_dependent->release(); - orders_independent_columns.insert(orders_independent_columns.end(), + orders_independent_columns.insert(orders_independent_columns.begin() + 2, 
std::make_move_iterator(orders_dependent_columns.begin()), std::make_move_iterator(orders_dependent_columns.end())); diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.hpp similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp rename to cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.hpp diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu rename to cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.hpp similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp rename to cpp/benchmarks/common/ndsh_data_generator/random_column_generator.hpp diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp rename to cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.hpp similarity index 100% rename from cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp rename to cpp/benchmarks/common/ndsh_data_generator/table_helpers.hpp diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index 61e79a47a50..e4ff0c8c4a7 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -50,7 +50,7 @@ static void bench_hash(nvbench::state& state) state.add_global_memory_reads<nvbench::int64_t>(num_rows); // add memory read from bitmaks if (!no_nulls) { - state.add_global_memory_reads<nvbench::int8_t>(2 * + state.add_global_memory_reads<nvbench::int8_t>(2L * cudf::bitmask_allocation_size_bytes(num_rows)); } // memory written depends on used hash @@ -63,37 +63,37 @@ }); } else if (hash_name == "md5") { // md5 creates a 32-byte string - state.add_global_memory_writes<nvbench::int8_t>(32 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(32L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); }); } else if (hash_name == "sha1") { // sha1 creates a 40-byte string - state.add_global_memory_writes<nvbench::int8_t>(40 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(40L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); }); } else if (hash_name == "sha224") { // sha224 creates a 56-byte string - state.add_global_memory_writes<nvbench::int8_t>(56 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(56L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); }); } else if (hash_name == "sha256") { // sha256 creates a 64-byte string - state.add_global_memory_writes<nvbench::int8_t>(64 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(64L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); }); } else if (hash_name == "sha384") { // sha384 creates a 96-byte string - state.add_global_memory_writes<nvbench::int8_t>(96 * 
num_rows); + state.add_global_memory_writes<nvbench::int8_t>(96L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); }); } else if (hash_name == "sha512") { // sha512 creates a 128-byte string - state.add_global_memory_writes<nvbench::int8_t>(128 * num_rows); + state.add_global_memory_writes<nvbench::int8_t>(128L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); }); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 7563c823454..ce115fd7723 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64; void parquet_read_common(cudf::size_type num_rows_to_read, cudf::size_type num_cols_to_read, cuio_source_sink_pair& source_sink, - nvbench::state& state) + nvbench::state& state, + size_t table_data_size = data_size) { cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); @@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read, }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second"); + state.add_element_count(static_cast<double>(table_data_size) / time, "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); @@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list +template <data_type DataType> +void BM_parquet_read_wide_tables(nvbench::state& state, + nvbench::type_list<nvbench::enum_type<DataType>> type_list) +{ + auto const d_type = get_type_or_group(static_cast<int32_t>(DataType)); + + auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality")); + auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + +void BM_parquet_read_wide_tables_mixed(nvbench::state& state) +{ + auto const d_type = []() { + auto d_type1 = get_type_or_group(static_cast<int32_t>(data_type::INTEGRAL)); + auto d_type2 = get_type_or_group(static_cast<int32_t>(data_type::FLOAT)); + d_type1.reserve(d_type1.size() + d_type2.size()); + std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1)); + return d_type1; + }(); + + auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality")); + auto const run_length = 
static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + using d_type_list = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table)) + .set_name("parquet_read_wide_tables") + .set_min_samples(4) + .set_type_axes_names({"data_type"}) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + +NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed) + .set_name("parquet_read_wide_tables_mixed") + .set_min_samples(4) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + // a benchmark for structs that only contain fixed-width types using d_type_list_struct_only = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only)) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 3abd4280081..7121cb9f034 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) } std::tuple, size_t, size_t> write_file_data( - nvbench::state& state, std::vector const& d_types) + nvbench::state& state, std::vector const& d_types, io_type io_source_type) { cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); @@ -63,7 +63,7 @@ std::tuple, size_t, size_t> write_file_data( size_t total_file_size = 0; for (size_t i = 0; i < num_files; ++i) { - cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + cuio_source_sink_pair source_sink{io_source_type}; auto const tbl = create_random_table( cycle_dtypes(d_types, num_cols), @@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, { size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, auto const num_threads = 
state.get_int64("num_threads"); size_t const input_limit = state.get_int64("input_limit"); size_t const output_limit = state.get_int64("output_limit"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .set_name("parquet_multithreaded_read_decode_fixed_width") @@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .set_name("parquet_multithreaded_read_decode_string") @@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .set_name("parquet_multithreaded_read_decode_list") @@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); // mixed data types: fixed width, strings NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) @@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .set_name("parquet_multithreaded_read_decode_chunked_fixed_width") @@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .set_name("parquet_multithreaded_read_decode_chunked_string") @@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) 
.add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .set_name("parquet_multithreaded_read_decode_chunked_list") @@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); diff --git a/cpp/benchmarks/ndsh/README.md b/cpp/benchmarks/ndsh/README.md new file mode 100644 index 00000000000..0a462e1684e --- /dev/null +++ b/cpp/benchmarks/ndsh/README.md @@ -0,0 +1,11 @@ +# NDS-H Benchmarks for `libcudf` + +## Disclaimer + +NDS-H is derived from the TPC-H Benchmarks and as such any results obtained using NDS-H are not +comparable to published TPC-H Benchmark results, as the results obtained from using NDS-H do not +comply with the TPC-H Benchmarks. + +## Current Status + +For now, only Q1, Q5, Q6, Q9, and Q10 have been implemented diff --git a/cpp/examples/tpch/q1.cpp b/cpp/benchmarks/ndsh/q01.cpp similarity index 82% rename from cpp/examples/tpch/q1.cpp rename to cpp/benchmarks/ndsh/q01.cpp index 87b7e613766..ef709926ae9 100644 --- a/cpp/examples/tpch/q1.cpp +++ b/cpp/benchmarks/ndsh/q01.cpp @@ -14,17 +14,19 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" #include +#include #include #include #include +#include + /** - * @file q1.cpp - * @brief Implement query 1 of the TPC-H benchmark. + * @file q01.cpp + * @brief Implement query 1 of the NDS-H benchmark. * * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; * @@ -59,7 +61,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ -[[nodiscard]] std::unique_ptr calc_disc_price( +[[nodiscard]] std::unique_ptr calculate_disc_price( cudf::column_view const& discount, cudf::column_view const& extendedprice, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -86,7 +88,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ -[[nodiscard]] std::unique_ptr calc_charge( +[[nodiscard]] std::unique_ptr calculate_charge( cudf::column_view const& tax, cudf::column_view const& disc_price, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -101,16 +103,9 @@ return charge; } -int main(int argc, char const** argv) +void run_ndsh_q1(nvbench::state& state, + std::unordered_map& sources) { - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; - // Define the column projections and filter predicate for `lineitem` table std::vector const lineitem_cols = {"l_returnflag", "l_linestatus", @@ -130,12 +125,12 @@ int main(int argc, char const** argv) // Read out the `lineitem` table from parquet file auto lineitem = - read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); + read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Calculate the discount price and charge columns and append to lineitem table auto disc_price = - calc_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice")); - auto charge = calc_charge(lineitem->column("l_tax"), disc_price->view()); + calculate_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice")); + auto charge = calculate_charge(lineitem->column("l_tax"), disc_price->view()); (*lineitem).append(disc_price, "disc_price").append(charge, "charge"); // Perform the group by operation @@ -167,9 +162,21 @@ int main(int argc, char const** argv) {"l_returnflag", "l_linestatus"}, {cudf::order::ASCENDING, cudf::order::ASCENDING}); - timer.print_elapsed_millis(); - // Write query result to a parquet file orderedby_table->to_parquet("q1.parquet"); - return 0; } + +void ndsh_q1(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q1(state, sources); }); +} + +NVBENCH_BENCH(ndsh_q1).set_name("ndsh_q1").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/examples/tpch/q5.cpp b/cpp/benchmarks/ndsh/q05.cpp similarity index 80% rename from cpp/examples/tpch/q5.cpp rename to cpp/benchmarks/ndsh/q05.cpp index 12c186db10e..522bc4789c2 100644 --- a/cpp/examples/tpch/q5.cpp +++ b/cpp/benchmarks/ndsh/q05.cpp @@ -14,17 +14,19 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" #include +#include #include #include #include +#include + /** - * @file q5.cpp - * @brief Implement query 5 of the TPC-H benchmark. + * @file q05.cpp + * @brief Implement query 5 of the NDS-H benchmark. * * create view customer as select * from '/tables/scale-1/customer.parquet'; * create view orders as select * from '/tables/scale-1/orders.parquet'; @@ -67,7 +69,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ -[[nodiscard]] std::unique_ptr calc_revenue( +[[nodiscard]] std::unique_ptr calculate_revenue( cudf::column_view const& extendedprice, cudf::column_view const& discount, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -86,16 +88,9 @@ return revenue; } -int main(int argc, char const** argv) +void run_ndsh_q5(nvbench::state& state, + std::unordered_map& sources) { - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; - // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( @@ -125,17 +120,17 @@ int main(int argc, char const** argv) // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = - read_parquet(args.dataset_dir + "/customer.parquet", {"c_custkey", "c_nationkey"}); + read_parquet(sources["customer"].make_source_info(), {"c_custkey", "c_nationkey"}); auto const orders = - read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); - auto const lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet", + read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); + auto const lineitem = read_parquet(sources["lineitem"].make_source_info(), {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); auto const supplier = - read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"}); + read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); auto const nation = - read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_regionkey", "n_name"}); + read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); auto const region = - read_parquet(args.dataset_dir + "/region.parquet", region_cols, std::move(region_pred)); + read_parquet(sources["region"].make_source_info(), region_cols, std::move(region_pred)); // Perform the joins auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); @@ -147,7 +142,7 @@ int main(int argc, char const** argv) // Calculate and append the `revenue` column auto revenue = - calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + calculate_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); (*joined_table).append(revenue, "revenue"); // Perform the groupby operation @@ -162,9 +157,22 @@ int main(int argc, char const** argv) auto const orderedby_table = apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); - timer.print_elapsed_millis(); - // Write query result to a parquet file orderedby_table->to_parquet("q5.parquet"); - return 0; } + +void ndsh_q5(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources( + scale_factor, {"customer", "orders", "lineitem", "supplier", "nation", "region"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q5(state, sources); }); 
+} + +NVBENCH_BENCH(ndsh_q5).set_name("ndsh_q5").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/examples/tpch/q6.cpp b/cpp/benchmarks/ndsh/q06.cpp similarity index 79% rename from cpp/examples/tpch/q6.cpp rename to cpp/benchmarks/ndsh/q06.cpp index 92dac40c768..04078547973 100644 --- a/cpp/examples/tpch/q6.cpp +++ b/cpp/benchmarks/ndsh/q06.cpp @@ -14,17 +14,20 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" #include +#include #include #include +#include #include +#include + /** - * @file q6.cpp - * @brief Implement query 6 of the TPC-H benchmark. + * @file q06.cpp + * @brief Implement query 6 of the NDS-H benchmark. * * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; * @@ -48,7 +51,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ -[[nodiscard]] std::unique_ptr calc_revenue( +[[nodiscard]] std::unique_ptr calculate_revenue( cudf::column_view const& extendedprice, cudf::column_view const& discount, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -60,16 +63,9 @@ return revenue; } -int main(int argc, char const** argv) +void run_ndsh_q6(nvbench::state& state, + std::unordered_map& sources) { - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; - // Read out the `lineitem` table from parquet file std::vector const lineitem_cols = { "l_extendedprice", "l_discount", "l_shipdate", "l_quantity"}; @@ -88,7 +84,7 @@ int main(int argc, char const** argv) auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); auto lineitem = - read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); + read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Cast the discount and quantity columns to float32 and append to lineitem table auto discout_float = @@ -99,8 +95,8 @@ int main(int argc, char const** argv) (*lineitem).append(discout_float, "l_discount_float").append(quantity_float, "l_quantity_float"); // Apply the filters - auto const discount_ref = cudf::ast::column_reference(lineitem->col_id("l_discount_float")); - auto const quantity_ref = cudf::ast::column_reference(lineitem->col_id("l_quantity_float")); + auto const discount_ref = cudf::ast::column_reference(lineitem->column_id("l_discount_float")); + auto const quantity_ref = cudf::ast::column_reference(lineitem->column_id("l_quantity_float")); auto discount_lower = cudf::numeric_scalar(0.05); auto const discount_lower_literal = cudf::ast::literal(discount_lower); @@ -123,16 +119,28 @@ int main(int argc, char const** argv) auto const filtered_table = apply_filter(lineitem, discount_quantity_pred); // Calculate the `revenue` column - auto revenue = - calc_revenue(filtered_table->column("l_extendedprice"), filtered_table->column("l_discount")); + auto revenue = calculate_revenue(filtered_table->column("l_extendedprice"), + filtered_table->column("l_discount")); // Sum the `revenue` column auto const revenue_view = revenue->view(); auto const result_table = apply_reduction(revenue_view, cudf::aggregation::Kind::SUM, "revenue"); - timer.print_elapsed_millis(); - // Write query result to a 
parquet file result_table->to_parquet("q6.parquet"); - return 0; } + +void ndsh_q6(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q6(state, sources); }); +} + +NVBENCH_BENCH(ndsh_q6).set_name("ndsh_q6").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/examples/tpch/q9.cpp b/cpp/benchmarks/ndsh/q09.cpp similarity index 78% rename from cpp/examples/tpch/q9.cpp rename to cpp/benchmarks/ndsh/q09.cpp index 2882182aa2b..59218ab8912 100644 --- a/cpp/examples/tpch/q9.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" +#include #include #include #include @@ -24,9 +24,11 @@ #include #include +#include + /** - * @file q9.cpp - * @brief Implement query 9 of the TPC-H benchmark. + * @file q09.cpp + * @brief Implement query 9 of the NDS-H benchmark. * * create view part as select * from '/tables/scale-1/part.parquet'; * create view supplier as select * from '/tables/scale-1/supplier.parquet'; @@ -79,7 +81,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ -[[nodiscard]] std::unique_ptr calc_amount( +[[nodiscard]] std::unique_ptr calculate_amount( cudf::column_view const& discount, cudf::column_view const& extendedprice, cudf::column_view const& supplycost, @@ -109,28 +111,21 @@ return amount; } -int main(int argc, char const** argv) +void run_ndsh_q9(nvbench::state& state, + std::unordered_map& sources) { - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; - // Read out the table from parquet files auto const lineitem = read_parquet( - args.dataset_dir + "/lineitem.parquet", + sources["lineitem"].make_source_info(), {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"}); - auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_name"}); + auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_name"}); auto const orders = - read_parquet(args.dataset_dir + "/orders.parquet", {"o_orderkey", "o_orderdate"}); - auto const part = read_parquet(args.dataset_dir + "/part.parquet", {"p_partkey", "p_name"}); - auto const partsupp = read_parquet(args.dataset_dir + "/partsupp.parquet", + read_parquet(sources["orders"].make_source_info(), {"o_orderkey", "o_orderdate"}); + auto const part = read_parquet(sources["part"].make_source_info(), {"p_partkey", "p_name"}); + auto const partsupp = read_parquet(sources["partsupp"].make_source_info(), {"ps_suppkey", "ps_partkey", "ps_supplycost"}); auto const supplier = - read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"}); + read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); // Generating the `profit` table // Filter the part table using `p_name like '%green%'` @@ -150,10 +145,10 @@ 
int main(int argc, char const** argv) // Calculate the `nation`, `o_year`, and `amount` columns auto n_name = std::make_unique(joined_table->column("n_name")); auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); - auto amount = calc_amount(joined_table->column("l_discount"), - joined_table->column("l_extendedprice"), - joined_table->column("ps_supplycost"), - joined_table->column("l_quantity")); + auto amount = calculate_amount(joined_table->column("l_discount"), + joined_table->column("l_extendedprice"), + joined_table->column("ps_supplycost"), + joined_table->column("l_quantity")); // Put together the `profit` table std::vector> profit_columns; @@ -175,9 +170,22 @@ int main(int argc, char const** argv) auto const orderedby_table = apply_orderby( groupedby_table, {"nation", "o_year"}, {cudf::order::ASCENDING, cudf::order::DESCENDING}); - timer.print_elapsed_millis(); - // Write query result to a parquet file orderedby_table->to_parquet("q9.parquet"); - return 0; } + +void ndsh_q9(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources( + scale_factor, {"part", "supplier", "lineitem", "partsupp", "orders", "nation"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q9(state, sources); }); +} + +NVBENCH_BENCH(ndsh_q9).set_name("ndsh_q9").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/examples/tpch/q10.cpp b/cpp/benchmarks/ndsh/q10.cpp similarity index 81% rename from cpp/examples/tpch/q10.cpp rename to cpp/benchmarks/ndsh/q10.cpp index fdf147b50e0..a520480020a 100644 --- a/cpp/examples/tpch/q10.cpp +++ b/cpp/benchmarks/ndsh/q10.cpp @@ -14,17 +14,19 @@ * limitations under the License. */ -#include "../utilities/timer.hpp" -#include "utils.hpp" +#include "utilities.hpp" #include +#include #include #include #include +#include + /** * @file q10.cpp - * @brief Implement query 10 of the TPC-H benchmark. + * @brief Implement query 10 of the NDS-H benchmark. * * create view customer as select * from '/tables/scale-1/customer.parquet'; * create view orders as select * from '/tables/scale-1/orders.parquet'; @@ -72,7 +74,7 @@ * @param stream The CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ -[[nodiscard]] std::unique_ptr calc_revenue( +[[nodiscard]] std::unique_ptr calculate_revenue( cudf::column_view const& extendedprice, cudf::column_view const& discount, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -90,16 +92,10 @@ mr); return revenue; } -int main(int argc, char const** argv) -{ - auto const args = parse_args(argc, argv); - - // Use a memory pool - auto resource = create_memory_resource(args.memory_resource_type); - cudf::set_current_device_resource(resource.get()); - - cudf::examples::timer timer; +void run_ndsh_q10(nvbench::state& state, + std::unordered_map& sources) +{ // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( @@ -126,15 +122,15 @@ int main(int argc, char const** argv) // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = read_parquet( - args.dataset_dir + "/customer.parquet", + sources["customer"].make_source_info(), {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); auto const orders = - read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); + read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); auto const lineitem = - read_parquet(args.dataset_dir + "/lineitem.parquet", + read_parquet(sources["lineitem"].make_source_info(), {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, std::move(lineitem_pred)); - auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_name", "n_nationkey"}); + auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_name", "n_nationkey"}); // Perform the joins auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); @@ -143,7 +139,7 @@ int main(int argc, char const** argv) // Calculate and append the `revenue` column auto revenue = - calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + calculate_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); (*joined_table).append(revenue, "revenue"); // Perform the groupby operation @@ -159,9 +155,22 @@ int main(int argc, char const** argv) auto const orderedby_table = apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); - timer.print_elapsed_millis(); - // Write query result to a parquet file orderedby_table->to_parquet("q10.parquet"); - return 0; } + +void ndsh_q10(nvbench::state& state) +{ + // Generate the required parquet files in device buffers + double const scale_factor = state.get_float64("scale_factor"); + std::unordered_map sources; + generate_parquet_data_sources( + scale_factor, {"customer", "orders", "lineitem", "nation"}, sources); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { run_ndsh_q10(state, sources); }); +} + +NVBENCH_BENCH(ndsh_q10).set_name("ndsh_q10").add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp new file mode 100644 index 00000000000..2d514764fc2 --- /dev/null +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utilities.hpp" + +#include "common/ndsh_data_generator/ndsh_data_generator.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace { + +std::vector const ORDERS_SCHEMA = {"o_orderkey", + "o_custkey", + "o_orderstatus", + "o_totalprice", + "o_orderdate", + "o_orderpriority", + "o_clerk", + "o_shippriority", + "o_comment"}; +std::vector const LINEITEM_SCHEMA = {"l_orderkey", + "l_partkey", + "l_suppkey", + "l_linenumber", + "l_quantity", + "l_extendedprice", + "l_discount", + "l_tax", + "l_returnflag", + "l_linestatus", + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "l_shipinstruct", + "l_shipmode", + "l_comment"}; +std::vector const PART_SCHEMA = {"p_partkey", + "p_name", + "p_mfgr", + "p_brand", + "p_type", + "p_size", + "p_container", + "p_retailprice", + "p_comment"}; +std::vector const PARTSUPP_SCHEMA = { + "ps_partkey", "ps_suppkey", "ps_availqty", "ps_supplycost", "ps_comment"}; +std::vector const SUPPLIER_SCHEMA = { + "s_suppkey", "s_name", "s_address", "s_nationkey", "s_phone", "s_acctbal", "s_comment"}; +std::vector const CUSTOMER_SCHEMA = {"c_custkey", + "c_name", + "c_address", + "c_nationkey", + "c_phone", + "c_acctbal", + "c_mktsegment", + "c_comment"}; +std::vector const NATION_SCHEMA = { + "n_nationkey", "n_name", "n_regionkey", "n_comment"}; +std::vector const REGION_SCHEMA = {"r_regionkey", "r_name", "r_comment"}; + +} // namespace + +cudf::table_view table_with_names::table() const { return tbl->view(); } + +cudf::column_view table_with_names::column(std::string const& col_name) const +{ + return tbl->view().column(column_id(col_name)); +} + +std::vector const& table_with_names::column_names() const { return col_names; } + +cudf::size_type table_with_names::column_id(std::string const& col_name) const +{ + auto it = std::find(col_names.begin(), col_names.end(), col_name); + if (it == col_names.end()) { + std::string err_msg = "Column `" + col_name + "` not found"; + throw std::runtime_error(err_msg); + } + return std::distance(col_names.begin(), it); +} + +table_with_names& table_with_names::append(std::unique_ptr& col, + std::string const& col_name) +{ + auto cols = tbl->release(); + cols.push_back(std::move(col)); + tbl = std::make_unique(std::move(cols)); + col_names.push_back(col_name); + return (*this); +} + +cudf::table_view table_with_names::select(std::vector const& col_names) const +{ + CUDF_FUNC_RANGE(); + std::vector col_indices; + for (auto const& col_name : col_names) { + col_indices.push_back(column_id(col_name)); + } + return tbl->select(col_indices); +} + +void table_with_names::to_parquet(std::string const& filepath) const +{ + CUDF_FUNC_RANGE(); + auto const sink_info = cudf::io::sink_info(filepath); + cudf::io::table_metadata metadata; + metadata.schema_info = + std::vector(col_names.begin(), col_names.end()); + auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; + 
auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + cudf::io::write_parquet(options); +} + +std::unique_ptr<cudf::table> join_and_gather(cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector<cudf::size_type> const& left_on, + std::vector<cudf::size_type> const& right_on, + cudf::null_equality compare_nulls) +{ + CUDF_FUNC_RANGE(); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = cudf::inner_join( + left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); + + auto const left_indices_span = cudf::device_span<cudf::size_type const>{*left_join_indices}; + auto const right_indices_span = cudf::device_span<cudf::size_type const>{*right_join_indices}; + + auto const left_indices_col = cudf::column_view{left_indices_span}; + auto const right_indices_col = cudf::column_view{right_indices_span}; + + auto const left_result = cudf::gather(left_input, left_indices_col, oob_policy); + auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy); + + auto joined_cols = left_result->release(); + auto right_cols = right_result->release(); + joined_cols.insert(joined_cols.end(), + std::make_move_iterator(right_cols.begin()), + std::make_move_iterator(right_cols.end())); + return std::make_unique<cudf::table>(std::move(joined_cols)); +} + +std::unique_ptr<table_with_names> apply_inner_join( + std::unique_ptr<table_with_names> const& left_input, + std::unique_ptr<table_with_names> const& right_input, + std::vector<std::string> const& left_on, + std::vector<std::string> const& right_on, + cudf::null_equality compare_nulls) +{ + CUDF_FUNC_RANGE(); + std::vector<cudf::size_type> left_on_indices; + std::vector<cudf::size_type> right_on_indices; + std::transform( + left_on.begin(), left_on.end(), std::back_inserter(left_on_indices), [&](auto const& col_name) { + return left_input->column_id(col_name); + }); + std::transform(right_on.begin(), + right_on.end(), + std::back_inserter(right_on_indices), + [&](auto const& col_name) { return right_input->column_id(col_name); }); + auto table = join_and_gather( + left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls); + std::vector<std::string> merged_column_names; + merged_column_names.reserve(left_input->column_names().size() + + right_input->column_names().size()); + std::copy(left_input->column_names().begin(), + left_input->column_names().end(), + std::back_inserter(merged_column_names)); + std::copy(right_input->column_names().begin(), + right_input->column_names().end(), + std::back_inserter(merged_column_names)); + return std::make_unique<table_with_names>(std::move(table), merged_column_names); +} + +std::unique_ptr<table_with_names> apply_filter(std::unique_ptr<table_with_names> const& table, + cudf::ast::operation const& predicate) +{ + CUDF_FUNC_RANGE(); + auto const boolean_mask = cudf::compute_column(table->table(), predicate); + auto result_table = cudf::apply_boolean_mask(table->table(), boolean_mask->view()); + return std::make_unique<table_with_names>(std::move(result_table), table->column_names()); +} + +std::unique_ptr<table_with_names> apply_mask(std::unique_ptr<table_with_names> const& table, + std::unique_ptr<cudf::column> const& mask) +{ + CUDF_FUNC_RANGE(); + auto result_table = cudf::apply_boolean_mask(table->table(), mask->view()); + return std::make_unique<table_with_names>(std::move(result_table), table->column_names()); +} + +std::unique_ptr<table_with_names> apply_groupby(std::unique_ptr<table_with_names> const& table, + groupby_context_t const& 
ctx) +{ + CUDF_FUNC_RANGE(); + auto const keys = table->select(ctx.keys); + cudf::groupby::groupby groupby_obj(keys); + std::vector<std::string> result_column_names; + result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end()); + std::vector<cudf::groupby::aggregation_request> requests; + for (auto& [value_col, aggregations] : ctx.values) { + requests.emplace_back(cudf::groupby::aggregation_request()); + for (auto& agg : aggregations) { + if (agg.first == cudf::aggregation::Kind::SUM) { + requests.back().aggregations.push_back( + cudf::make_sum_aggregation<cudf::groupby_aggregation>()); + } else if (agg.first == cudf::aggregation::Kind::MEAN) { + requests.back().aggregations.push_back( + cudf::make_mean_aggregation<cudf::groupby_aggregation>()); + } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) { + requests.back().aggregations.push_back( + cudf::make_count_aggregation<cudf::groupby_aggregation>()); + } else { + throw std::runtime_error("Unsupported aggregation"); + } + result_column_names.push_back(agg.second); + } + requests.back().values = table->column(value_col); + } + auto agg_results = groupby_obj.aggregate(requests); + std::vector<std::unique_ptr<cudf::column>> result_columns; + for (auto i = 0; i < agg_results.first->num_columns(); i++) { + auto col = std::make_unique<cudf::column>(agg_results.first->get_column(i)); + result_columns.push_back(std::move(col)); + } + for (size_t i = 0; i < agg_results.second.size(); i++) { + for (size_t j = 0; j < agg_results.second[i].results.size(); j++) { + result_columns.push_back(std::move(agg_results.second[i].results[j])); + } + } + auto result_table = std::make_unique<cudf::table>(std::move(result_columns)); + return std::make_unique<table_with_names>(std::move(result_table), result_column_names); +} + +std::unique_ptr<table_with_names> apply_orderby(std::unique_ptr<table_with_names> const& table, + std::vector<std::string> const& sort_keys, + std::vector<cudf::order> const& sort_key_orders) +{ + CUDF_FUNC_RANGE(); + std::vector<cudf::column_view> column_views; + for (auto& key : sort_keys) { + column_views.push_back(table->column(key)); + } + auto result_table = + cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders); + return std::make_unique<table_with_names>(std::move(result_table), table->column_names()); +} + +std::unique_ptr<table_with_names> apply_reduction(cudf::column_view const& column, + cudf::aggregation::Kind const& agg_kind, + std::string const& col_name) +{ + CUDF_FUNC_RANGE(); + auto const agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>(); + auto const result = cudf::reduce(column, *agg, column.type()); + cudf::size_type const len = 1; + auto col = cudf::make_column_from_scalar(*result, len); + std::vector<std::unique_ptr<cudf::column>> columns; + columns.push_back(std::move(col)); + auto result_table = std::make_unique<cudf::table>(std::move(columns)); + std::vector<std::string> col_names = {col_name}; + return std::make_unique<table_with_names>(std::move(result_table), col_names); +} + +std::unique_ptr<table_with_names> read_parquet( + cudf::io::source_info const& source_info, + std::vector<std::string> const& columns, + std::unique_ptr<cudf::ast::operation> const& predicate) +{ + CUDF_FUNC_RANGE(); + auto builder = cudf::io::parquet_reader_options_builder(source_info); + if (!columns.empty()) { builder.columns(columns); } + if (predicate) { builder.filter(*predicate); } + auto const options = builder.build(); + auto table_with_metadata = cudf::io::read_parquet(options); + std::vector<std::string> column_names; + for (auto const& col_info : table_with_metadata.metadata.schema_info) { + column_names.push_back(col_info.name); + } + return std::make_unique<table_with_names>(std::move(table_with_metadata.tbl), column_names); +} + +std::tm make_tm(int year, int month, int day) +{ + std::tm tm{}; + tm.tm_year = year - 1900; + tm.tm_mon = month - 1; + tm.tm_mday = day; + return tm; +} + +int32_t days_since_epoch(int year, int month, int day) +{ + 
std::tm tm = make_tm(year, month, day); + std::tm epoch = make_tm(1970, 1, 1); + std::time_t time = std::mktime(&tm); + std::time_t epoch_time = std::mktime(&epoch); + double diff = std::difftime(time, epoch_time) / (60 * 60 * 24); + return static_cast<int32_t>(diff); +} + +void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table, + std::vector<std::string> const& col_names, + parquet_device_buffer& source) +{ + CUDF_FUNC_RANGE(); + auto const stream = cudf::get_default_stream(); + + // Prepare the table metadata + cudf::io::table_metadata metadata; + std::vector<cudf::io::column_name_info> col_name_infos; + for (auto& col_name : col_names) { + col_name_infos.push_back(cudf::io::column_name_info(col_name)); + } + metadata.schema_info = col_name_infos; + auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; + + // Declare a host and device buffer + std::vector<char> h_buffer; + + // Write parquet data to host buffer + auto builder = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&h_buffer), table->view()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + cudf::io::write_parquet(options); + + // Copy host buffer to device buffer + source.d_buffer.resize(h_buffer.size(), stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + source.d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); +} + +void generate_parquet_data_sources(double scale_factor, + std::vector<std::string> const& table_names, + std::unordered_map<std::string, parquet_device_buffer>& sources) +{ + CUDF_FUNC_RANGE(); + std::for_each(table_names.begin(), table_names.end(), [&](auto const& table_name) { + sources[table_name] = parquet_device_buffer(); + }); + + auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + + auto partsupp = cudf::datagen::generate_partsupp( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + + auto supplier = cudf::datagen::generate_supplier( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + + auto customer = cudf::datagen::generate_customer( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + + auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + auto region = cudf::datagen::generate_region(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + write_to_parquet_device_buffer(std::move(orders), ORDERS_SCHEMA, sources["orders"]); + write_to_parquet_device_buffer(std::move(lineitem), LINEITEM_SCHEMA, sources["lineitem"]); + write_to_parquet_device_buffer(std::move(part), PART_SCHEMA, sources["part"]); + write_to_parquet_device_buffer(std::move(partsupp), PARTSUPP_SCHEMA, sources["partsupp"]); + write_to_parquet_device_buffer(std::move(customer), CUSTOMER_SCHEMA, sources["customer"]); + write_to_parquet_device_buffer(std::move(supplier), SUPPLIER_SCHEMA, sources["supplier"]); + write_to_parquet_device_buffer(std::move(nation), NATION_SCHEMA, sources["nation"]); + write_to_parquet_device_buffer(std::move(region), REGION_SCHEMA, sources["region"]); +}
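These two helpers are the glue between data generation and the timed benchmark region: each table is serialized to Parquet once, and the bytes stay resident in device memory. A minimal sketch of how a query benchmark might consume them — the `ndsh_q6` body, the `scale_factor` axis name, and the column list here are illustrative assumptions, not part of this patch:

```cpp
#include <nvbench/nvbench.cuh>

void ndsh_q6(nvbench::state& state)
{
  // Untimed setup: generate lineitem at the requested scale factor and stage
  // it as Parquet bytes in device memory.
  std::unordered_map<std::string, parquet_device_buffer> sources;
  generate_parquet_data_sources(state.get_float64("scale_factor"), {"lineitem"}, sources);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    // Timed region: the read pulls from the device buffer, so host I/O and
    // datagen cost are excluded from the measurement.
    auto lineitem = read_parquet(sources["lineitem"].make_source_info(),
                                 {"l_extendedprice", "l_discount", "l_quantity"});
  });
}
```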
diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp new file mode 100644 index 00000000000..762e43deccf --- /dev/null +++ b/cpp/benchmarks/ndsh/utilities.hpp @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cudf/io/types.hpp> +#include <cudf/table/table.hpp> +#include <cudf/table/table_view.hpp> + +#include <rmm/device_uvector.hpp> + +/** + * @brief A class to represent a table with column names attached + */ +class table_with_names { + public: + table_with_names(std::unique_ptr<cudf::table> tbl, std::vector<std::string> col_names) + : tbl(std::move(tbl)), col_names(col_names){}; + /** + * @brief Return the table view + */ + [[nodiscard]] cudf::table_view table() const; + /** + * @brief Return the column view for a given column name + * + * @param col_name The name of the column + */ + [[nodiscard]] cudf::column_view column(std::string const& col_name) const; + /** + * @brief Return the column names of the table + */ + [[nodiscard]] std::vector<std::string> const& column_names() const; + /** + * @brief Translate a column name to a column index + * + * @param col_name The name of the column + */ + [[nodiscard]] cudf::size_type column_id(std::string const& col_name) const; + /** + * @brief Append a column to the table + * + * @param col The column to append + * @param col_name The name of the appended column + */ + table_with_names& append(std::unique_ptr<cudf::column>& col, std::string const& col_name); + /** + * @brief Select a subset of columns from the table + * + * @param col_names The names of the columns to select + */ + [[nodiscard]] cudf::table_view select(std::vector<std::string> const& col_names) const; + /** + * @brief Write the table to a parquet file + * + * @param filepath The path to the parquet file + */ + void to_parquet(std::string const& filepath) const; + + private: + std::unique_ptr<cudf::table> tbl; + std::vector<std::string> col_names; +}; + +/** + * @brief Inner join two tables and gather the result + * + * @param left_input The left input table + * @param right_input The right input table + * @param left_on The columns to join on in the left table + * @param right_on The columns to join on in the right table + * @param compare_nulls The null equality policy + */ +[[nodiscard]] std::unique_ptr<cudf::table> join_and_gather( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector<cudf::size_type> const& left_on, + std::vector<cudf::size_type> const& right_on, + cudf::null_equality compare_nulls); + +/** + * @brief Apply an inner join operation to two tables + * + * @param left_input The left input table + * @param right_input The right input table + * @param left_on The columns to join on in the left table + * @param right_on The columns to join on in the right table + * @param compare_nulls The null equality policy + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_inner_join( + std::unique_ptr<table_with_names> const& left_input, + std::unique_ptr<table_with_names> const& right_input, + std::vector<std::string> const& left_on, + std::vector<std::string> const& right_on, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL); + +/** + * @brief Apply a filter predicate to a table + * + * @param table The input table + * @param predicate The filter predicate + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_filter( + std::unique_ptr<table_with_names> const& table, cudf::ast::operation const& predicate); + +/** + * @brief Apply a boolean mask to a table + * + * @param table The 
input table + * @param mask The boolean mask + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_mask( + std::unique_ptr<table_with_names> const& table, std::unique_ptr<cudf::column> const& mask); + +/** + * Struct representing group by key columns, value columns, and the type of aggregations to perform + * on the value columns + */ +struct groupby_context_t { + std::vector<std::string> keys; + std::unordered_map<std::string, std::vector<std::pair<cudf::aggregation::Kind, std::string>>> + values; +}; + +/** + * @brief Apply a groupby operation to a table + * + * @param table The input table + * @param ctx The groupby context + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_groupby( + std::unique_ptr<table_with_names> const& table, groupby_context_t const& ctx); + +/** + * @brief Apply an order by operation to a table + * + * @param table The input table + * @param sort_keys The sort keys + * @param sort_key_orders The sort key orders + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_orderby( + std::unique_ptr<table_with_names> const& table, + std::vector<std::string> const& sort_keys, + std::vector<cudf::order> const& sort_key_orders); + +/** + * @brief Apply a reduction operation to a column + * + * @param column The input column + * @param agg_kind The aggregation kind + * @param col_name The name of the output column + */ +[[nodiscard]] std::unique_ptr<table_with_names> apply_reduction( + cudf::column_view const& column, + cudf::aggregation::Kind const& agg_kind, + std::string const& col_name); + +/** + * @brief Read a parquet file into a table + * + * @param source_info The source of the parquet file + * @param columns The columns to read + * @param predicate The filter predicate to pushdown + */ +[[nodiscard]] std::unique_ptr<table_with_names> read_parquet( + cudf::io::source_info const& source_info, + std::vector<std::string> const& columns = {}, + std::unique_ptr<cudf::ast::operation> const& predicate = nullptr); + +/** + * @brief Generate the `std::tm` structure from year, month, and day + * + * @param year The year + * @param month The month + * @param day The day + */ +std::tm make_tm(int year, int month, int day); + +/** + * @brief Calculate the number of days since the UNIX epoch + * + * @param year The year + * @param month The month + * @param day The day + */ +int32_t days_since_epoch(int year, int month, int day); + +/** + * @brief Struct representing a parquet device buffer + */ +struct parquet_device_buffer { + parquet_device_buffer() : d_buffer{0, cudf::get_default_stream()} {}; + cudf::io::source_info make_source_info() { return cudf::io::source_info(d_buffer); } + rmm::device_uvector<std::byte> d_buffer; +}; + +/** + * @brief Write a `cudf::table` to a parquet device buffer + * + * @param table The `cudf::table` to write + * @param col_names The column names of the table + * @param source The parquet device buffer to write the table to + */ +void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table, + std::vector<std::string> const& col_names, + parquet_device_buffer& source); + +/** + * @brief Generate NDS-H tables and write to parquet device buffers + * + * @param scale_factor The scale factor of NDS-H tables to generate + * @param table_names The names of the tables to generate + * @param sources The parquet data sources to populate + */ +void generate_parquet_data_sources(double scale_factor, + std::vector<std::string> const& table_names, + std::unordered_map<std::string, parquet_device_buffer>& sources);
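Together these declarations form a small relational helper API over libcudf: scan, filter, join, group, sort. A sketch of how an NDS-H style aggregation might be composed from it — the Parquet path and output column names are hypothetical:

```cpp
auto lineitem = read_parquet(cudf::io::source_info("lineitem.parquet"),
                             {"l_returnflag", "l_quantity"});

// Group by l_returnflag, producing a sum and a mean of l_quantity
groupby_context_t ctx{
  {"l_returnflag"},
  {{"l_quantity",
    {{cudf::aggregation::Kind::SUM, "sum_qty"}, {cudf::aggregation::Kind::MEAN, "avg_qty"}}}}};

auto grouped = apply_groupby(lineitem, ctx);
auto sorted  = apply_orderby(grouped, {"l_returnflag"}, {cudf::order::ASCENDING});
sorted->to_parquet("q1.parquet");
```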
diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp new file mode 100644 index 00000000000..adc3dddc59c --- /dev/null +++ b/cpp/benchmarks/text/word_minhash.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <benchmarks/common/generate_input.hpp> + +#include <cudf/column/column_factories.hpp> +#include <cudf/filling.hpp> +#include <cudf/scalar/scalar.hpp> +#include <cudf/strings/strings_column_view.hpp> + +#include <nvtext/minhash.hpp> + +#include <rmm/device_buffer.hpp> + +#include <nvbench/nvbench.cuh> + +static void bench_word_minhash(nvbench::state& state) +{ + auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width")); + auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; + + data_profile const strings_profile = + data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + auto strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + + auto const num_offsets = (num_rows / row_width) + 1; + auto offsets = cudf::sequence(num_offsets, + cudf::numeric_scalar<cudf::size_type>(0), + cudf::numeric_scalar<cudf::size_type>(row_width)); + + auto source = cudf::make_lists_column(num_offsets - 1, + std::move(offsets), + std::move(strings_table->release().front()), + 0, + rmm::device_buffer{}); + + data_profile const seeds_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, 256); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads<nvbench::int8_t>(chars_size); + state.add_global_memory_writes<nvbench::int32_t>(num_rows); // output are hashes + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) + : nvtext::word_minhash(source->view(), seeds.view()); + }); +} + +NVBENCH_BENCH(bench_word_minhash) + .set_name("word_minhash") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) + .add_int64_axis("row_width", {10, 100, 1000}) + .add_int64_axis("seed_count", {2, 25}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 8d206f245dc..6d1c91a5752 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +By default, only the `\n` character is recognized as a line break. 
The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: +- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) +- Line separator (Unicode: `2028`, UTF-8: `E280A8`) +- Next line (Unicode: `0085`, UTF-8: `C285`) +- Carriage return (Unicode: `000D`, UTF-8: `0D`) + **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. - Unmatched paired special characters like `()`, `[]`, and `{}`. diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 8e8d8bd0b78..25984df1b60 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -57,7 +57,6 @@ build_example() { } build_example basic -build_example tpch build_example strings build_example nested_types build_example parquet_io diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt deleted file mode 100644 index 373a6d72d56..00000000000 --- a/cpp/examples/tpch/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -cmake_minimum_required(VERSION 3.26.4) - -include(../set_cuda_architecture.cmake) - -rapids_cuda_init_architectures(tpch_example) -rapids_cuda_set_architectures(RAPIDS) - -project( - tpch_example - VERSION 0.0.1 - LANGUAGES CXX CUDA -) - -include(../fetch_dependencies.cmake) - -add_executable(tpch_q1 q1.cpp) -target_link_libraries(tpch_q1 PRIVATE cudf::cudf) -target_compile_features(tpch_q1 PRIVATE cxx_std_17) - -add_executable(tpch_q5 q5.cpp) -target_link_libraries(tpch_q5 PRIVATE cudf::cudf) -target_compile_features(tpch_q5 PRIVATE cxx_std_17) - -add_executable(tpch_q6 q6.cpp) -target_link_libraries(tpch_q6 PRIVATE cudf::cudf) -target_compile_features(tpch_q6 PRIVATE cxx_std_17) - -add_executable(tpch_q9 q9.cpp) -target_link_libraries(tpch_q9 PRIVATE cudf::cudf) -target_compile_features(tpch_q9 PRIVATE cxx_std_17) - -add_executable(tpch_q10 q10.cpp) -target_link_libraries(tpch_q10 PRIVATE cudf::cudf) -target_compile_features(tpch_q10 PRIVATE cxx_std_17) diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md deleted file mode 100644 index 8c046c3f1e8..00000000000 --- a/cpp/examples/tpch/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# TPC-H Derived Examples - -Implements TPC-H queries using `libcudf`. We leverage the data generator (wrapper around official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) for generating data in Parquet format. - -## Requirements - -- Rust -- [libcudf](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment) - -## Running Queries - -1. Build the `libcudf` examples. -```bash -cd cudf/cpp/examples -./build.sh -``` -The TPC-H query binaries would be built inside `tpch/build`. - -2. Generate the dataset. -```bash -cd tpch/datagen -./datagen.sh [scale factor (1/10)] -``` - -The parquet files will be generated in `tpch/datagen/datafusion/benchmarks/data/tpch_sf[scale factor]`. - -3. Set these environment variables for optimized runtimes. -```bash -export KVIKIO_COMPAT_MODE="on" -export LIBCUDF_CUFILE_POLICY="KVIKIO" -export CUDA_MODULE_LOADING="EAGER" -``` - -4. Execute the queries. 
-```bash -./tpch/build/tpch_q[query no] [path to dataset] [memory resource type (cuda/pool/managed/managed_pool)] -``` - -A parquet file named `q[query no].parquet` would be generated containing the results of the query. diff --git a/cpp/examples/tpch/datagen/correct_datatypes.py b/cpp/examples/tpch/datagen/correct_datatypes.py deleted file mode 100644 index 8564774647b..00000000000 --- a/cpp/examples/tpch/datagen/correct_datatypes.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -import os -import sys - -import pyarrow as pa -import pyarrow.parquet as pq -import pandas as pd - -if __name__ == "__main__": - dataset_path = str(sys.argv[1]) - tables = ["lineitem", "part", "partsupp", "orders", "supplier", "customer", "nation", "region"] - for table in tables: - filepath = os.path.join(dataset_path, f"{table}.parquet") - print("Reading file ", filepath) - - if filepath.endswith("lineitem.parquet"): - df = pd.read_parquet(filepath) - df["l_linenumber"] = df["l_linenumber"].astype("int64") - df["l_quantity"] = df["l_quantity"].astype("int64") - df["l_extendedprice"] = df["l_extendedprice"].astype("float64") - df["l_discount"] = df["l_discount"].astype("float64") - df["l_tax"] = df["l_tax"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("part.parquet"): - df = pd.read_parquet(filepath) - df["p_size"] = df["p_size"].astype("int64") - df["p_retailprice"] = df["p_retailprice"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("partsupp.parquet"): - df = pd.read_parquet(filepath) - df["ps_availqty"] = df["ps_availqty"].astype("int64") - df["ps_supplycost"] = df["ps_supplycost"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("orders.parquet"): - df = pd.read_parquet(filepath) - df["o_totalprice"] = df["o_totalprice"].astype("float64") - df["o_shippriority"] = df["o_shippriority"].astype("int64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("supplier.parquet"): - df = pd.read_parquet(filepath) - df["s_acctbal"] = df["s_acctbal"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("customer.parquet"): - df = pd.read_parquet(filepath) - df["c_acctbal"] = df["c_acctbal"].astype("float64") - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("nation.parquet"): - df = pd.read_parquet(filepath) - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") - - elif filepath.endswith("region.parquet"): - df = pd.read_parquet(filepath) - pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") diff --git a/cpp/examples/tpch/datagen/datagen.sh b/cpp/examples/tpch/datagen/datagen.sh deleted file mode 100755 index 0b03753daea..00000000000 --- a/cpp/examples/tpch/datagen/datagen.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -set -e - -scale_factor=$1 -script_dir=$(pwd) - -# Clone the datafusion repository and apply a patch -# for single threaded data generation so that a -# single parquet file is generated for each table -rm -rf datafusion -git clone https://github.com/apache/datafusion.git datafusion -cd datafusion/ -git checkout 679a85f -git apply ${script_dir}/tpch.patch -cd benchmarks/ - -# Generate the data -# Currently, we support only scale factor 1 and 10 -if [ ${scale_factor} -eq 1 ]; then - ./bench.sh data tpch -elif [ ${scale_factor} -eq 10 ]; then - ./bench.sh data tpch10 -else - echo "Unsupported scale factor" - exit 1 -fi - -# Correct the datatypes of the parquet files -python3 ${script_dir}/correct_datatypes.py data/tpch_sf${scale_factor} diff --git a/cpp/examples/tpch/datagen/tpch.patch b/cpp/examples/tpch/datagen/tpch.patch deleted file mode 100644 index 42727aa9904..00000000000 --- a/cpp/examples/tpch/datagen/tpch.patch +++ /dev/null @@ -1,33 +0,0 @@ -diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh -index 3b854f6dc..f000f09c0 100755 ---- a/benchmarks/bench.sh -+++ b/benchmarks/bench.sh -@@ -311,6 +311,15 @@ data_tpch() { - $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet - popd > /dev/null - fi -+ -+ cp ${TPCH_DIR}/lineitem/part-0.parquet ${TPCH_DIR}/lineitem.parquet -+ cp ${TPCH_DIR}/orders/part-0.parquet ${TPCH_DIR}/orders.parquet -+ cp ${TPCH_DIR}/part/part-0.parquet ${TPCH_DIR}/part.parquet -+ cp ${TPCH_DIR}/partsupp/part-0.parquet ${TPCH_DIR}/partsupp.parquet -+ cp ${TPCH_DIR}/customer/part-0.parquet ${TPCH_DIR}/customer.parquet -+ cp ${TPCH_DIR}/supplier/part-0.parquet ${TPCH_DIR}/supplier.parquet -+ cp ${TPCH_DIR}/nation/part-0.parquet ${TPCH_DIR}/nation.parquet -+ cp ${TPCH_DIR}/region/part-0.parquet ${TPCH_DIR}/region.parquet - } - - # Runs the tpch benchmark -diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs -index b5204b343..84fd2e78d 100644 ---- a/datafusion/common/src/config.rs -+++ b/datafusion/common/src/config.rs -@@ -250,7 +250,7 @@ config_namespace! { - /// concurrency. - /// - /// Defaults to the number of CPU cores on the system -- pub target_partitions: usize, default = num_cpus::get() -+ pub target_partitions: usize, default = 1 - - /// The default time zone - /// diff --git a/cpp/examples/tpch/utils.hpp b/cpp/examples/tpch/utils.hpp deleted file mode 100644 index 8102fa8f976..00000000000 --- a/cpp/examples/tpch/utils.hpp +++ /dev/null @@ -1,458 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -// RMM memory resource creation utilities -inline auto make_cuda() { return std::make_shared(); } -inline auto make_pool() -{ - return rmm::mr::make_owning_wrapper( - make_cuda(), rmm::percent_of_free_device_memory(50)); -} -inline auto make_managed() { return std::make_shared(); } -inline auto make_managed_pool() -{ - return rmm::mr::make_owning_wrapper( - make_managed(), rmm::percent_of_free_device_memory(50)); -} -inline std::shared_ptr create_memory_resource( - std::string const& mode) -{ - if (mode == "cuda") return make_cuda(); - if (mode == "pool") return make_pool(); - if (mode == "managed") return make_managed(); - if (mode == "managed_pool") return make_managed_pool(); - CUDF_FAIL("Unknown rmm_mode parameter: " + mode + - "\nExpecting: cuda, pool, managed, or managed_pool"); -} - -/** - * @brief A class to represent a table with column names attached - */ -class table_with_names { - public: - table_with_names(std::unique_ptr tbl, std::vector col_names) - : tbl(std::move(tbl)), col_names(col_names) - { - } - /** - * @brief Return the table view - */ - [[nodiscard]] cudf::table_view table() const { return tbl->view(); } - /** - * @brief Return the column view for a given column name - * - * @param col_name The name of the column - */ - [[nodiscard]] cudf::column_view column(std::string const& col_name) const - { - return tbl->view().column(col_id(col_name)); - } - /** - * @param Return the column names of the table - */ - [[nodiscard]] std::vector column_names() const { return col_names; } - /** - * @brief Translate a column name to a column index - * - * @param col_name The name of the column - */ - [[nodiscard]] cudf::size_type col_id(std::string const& col_name) const - { - CUDF_FUNC_RANGE(); - auto it = std::find(col_names.begin(), col_names.end(), col_name); - if (it == col_names.end()) { throw std::runtime_error("Column not found"); } - return std::distance(col_names.begin(), it); - } - /** - * @brief Append a column to the table - * - * @param col The column to append - * @param col_name The name of the appended column - */ - table_with_names& append(std::unique_ptr& col, std::string const& col_name) - { - CUDF_FUNC_RANGE(); - auto cols = tbl->release(); - cols.push_back(std::move(col)); - tbl = std::make_unique(std::move(cols)); - col_names.push_back(col_name); - return (*this); - } - /** - * @brief Select a subset of columns from the table - * - * @param col_names The names of the columns to select - */ - [[nodiscard]] cudf::table_view select(std::vector const& col_names) const - { - CUDF_FUNC_RANGE(); - std::vector col_indices; - for (auto const& col_name : col_names) { - col_indices.push_back(col_id(col_name)); - } - return tbl->select(col_indices); - } - /** - * @brief Write the table to a parquet file - * - * @param filepath The path to the parquet file - */ - void to_parquet(std::string const& filepath) const - { - CUDF_FUNC_RANGE(); - auto const sink_info = cudf::io::sink_info(filepath); - cudf::io::table_metadata metadata; - metadata.schema_info = - std::vector(col_names.begin(), col_names.end()); - auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; - auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view()); - builder.metadata(table_input_metadata); - auto const options = builder.build(); - 
cudf::io::write_parquet(options); - } - - private: - std::unique_ptr tbl; - std::vector col_names; -}; - -/** - * @brief Concatenate two vectors - * - * @param lhs The left vector - * @param rhs The right vector - */ -template -std::vector concat(std::vector const& lhs, std::vector const& rhs) -{ - std::vector result; - result.reserve(lhs.size() + rhs.size()); - std::copy(lhs.begin(), lhs.end(), std::back_inserter(result)); - std::copy(rhs.begin(), rhs.end(), std::back_inserter(result)); - return result; -} - -/** - * @brief Inner join two tables and gather the result - * - * @param left_input The left input table - * @param right_input The right input table - * @param left_on The columns to join on in the left table - * @param right_on The columns to join on in the right table - * @param compare_nulls The null equality policy - */ -[[nodiscard]] std::unique_ptr join_and_gather( - cudf::table_view const& left_input, - cudf::table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - cudf::null_equality compare_nulls) -{ - CUDF_FUNC_RANGE(); - constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; - auto const left_selected = left_input.select(left_on); - auto const right_selected = right_input.select(right_on); - auto const [left_join_indices, right_join_indices] = cudf::inner_join( - left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); - - auto const left_indices_span = cudf::device_span{*left_join_indices}; - auto const right_indices_span = cudf::device_span{*right_join_indices}; - - auto const left_indices_col = cudf::column_view{left_indices_span}; - auto const right_indices_col = cudf::column_view{right_indices_span}; - - auto const left_result = cudf::gather(left_input, left_indices_col, oob_policy); - auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy); - - auto joined_cols = left_result->release(); - auto right_cols = right_result->release(); - joined_cols.insert(joined_cols.end(), - std::make_move_iterator(right_cols.begin()), - std::make_move_iterator(right_cols.end())); - return std::make_unique(std::move(joined_cols)); -} - -/** - * @brief Apply an inner join operation to two tables - * - * @param left_input The left input table - * @param right_input The right input table - * @param left_on The columns to join on in the left table - * @param right_on The columns to join on in the right table - * @param compare_nulls The null equality policy - */ -[[nodiscard]] std::unique_ptr apply_inner_join( - std::unique_ptr const& left_input, - std::unique_ptr const& right_input, - std::vector const& left_on, - std::vector const& right_on, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) -{ - CUDF_FUNC_RANGE(); - std::vector left_on_indices; - std::vector right_on_indices; - std::transform( - left_on.begin(), left_on.end(), std::back_inserter(left_on_indices), [&](auto const& col_name) { - return left_input->col_id(col_name); - }); - std::transform(right_on.begin(), - right_on.end(), - std::back_inserter(right_on_indices), - [&](auto const& col_name) { return right_input->col_id(col_name); }); - auto table = join_and_gather( - left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls); - return std::make_unique( - std::move(table), concat(left_input->column_names(), right_input->column_names())); -} - -/** - * @brief Apply a filter predicated to a table - * - * @param table The input table - * @param predicate The filter 
predicate - */ -[[nodiscard]] std::unique_ptr apply_filter( - std::unique_ptr const& table, cudf::ast::operation const& predicate) -{ - CUDF_FUNC_RANGE(); - auto const boolean_mask = cudf::compute_column(table->table(), predicate); - auto result_table = cudf::apply_boolean_mask(table->table(), boolean_mask->view()); - return std::make_unique(std::move(result_table), table->column_names()); -} - -/** - * @brief Apply a boolean mask to a table - * - * @param table The input table - * @param mask The boolean mask - */ -[[nodiscard]] std::unique_ptr apply_mask( - std::unique_ptr const& table, std::unique_ptr const& mask) -{ - CUDF_FUNC_RANGE(); - auto result_table = cudf::apply_boolean_mask(table->table(), mask->view()); - return std::make_unique(std::move(result_table), table->column_names()); -} - -struct groupby_context_t { - std::vector keys; - std::unordered_map>> - values; -}; - -/** - * @brief Apply a groupby operation to a table - * - * @param table The input table - * @param ctx The groupby context - */ -[[nodiscard]] std::unique_ptr apply_groupby( - std::unique_ptr const& table, groupby_context_t const& ctx) -{ - CUDF_FUNC_RANGE(); - auto const keys = table->select(ctx.keys); - cudf::groupby::groupby groupby_obj(keys); - std::vector result_column_names; - result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end()); - std::vector requests; - for (auto& [value_col, aggregations] : ctx.values) { - requests.emplace_back(cudf::groupby::aggregation_request()); - for (auto& agg : aggregations) { - if (agg.first == cudf::aggregation::Kind::SUM) { - requests.back().aggregations.push_back( - cudf::make_sum_aggregation()); - } else if (agg.first == cudf::aggregation::Kind::MEAN) { - requests.back().aggregations.push_back( - cudf::make_mean_aggregation()); - } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) { - requests.back().aggregations.push_back( - cudf::make_count_aggregation()); - } else { - throw std::runtime_error("Unsupported aggregation"); - } - result_column_names.push_back(agg.second); - } - requests.back().values = table->column(value_col); - } - auto agg_results = groupby_obj.aggregate(requests); - std::vector> result_columns; - for (size_t i = 0; i < agg_results.first->num_columns(); i++) { - auto col = std::make_unique(agg_results.first->get_column(i)); - result_columns.push_back(std::move(col)); - } - for (size_t i = 0; i < agg_results.second.size(); i++) { - for (size_t j = 0; j < agg_results.second[i].results.size(); j++) { - result_columns.push_back(std::move(agg_results.second[i].results[j])); - } - } - auto result_table = std::make_unique(std::move(result_columns)); - return std::make_unique(std::move(result_table), result_column_names); -} - -/** - * @brief Apply an order by operation to a table - * - * @param table The input table - * @param sort_keys The sort keys - * @param sort_key_orders The sort key orders - */ -[[nodiscard]] std::unique_ptr apply_orderby( - std::unique_ptr const& table, - std::vector const& sort_keys, - std::vector const& sort_key_orders) -{ - CUDF_FUNC_RANGE(); - std::vector column_views; - for (auto& key : sort_keys) { - column_views.push_back(table->column(key)); - } - auto result_table = - cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders); - return std::make_unique(std::move(result_table), table->column_names()); -} - -/** - * @brief Apply a reduction operation to a column - * - * @param column The input column - * @param agg_kind The aggregation kind - * @param col_name The 
name of the output column - */ -[[nodiscard]] std::unique_ptr apply_reduction( - cudf::column_view const& column, - cudf::aggregation::Kind const& agg_kind, - std::string const& col_name) -{ - CUDF_FUNC_RANGE(); - auto const agg = cudf::make_sum_aggregation(); - auto const result = cudf::reduce(column, *agg, column.type()); - cudf::size_type const len = 1; - auto col = cudf::make_column_from_scalar(*result, len); - std::vector> columns; - columns.push_back(std::move(col)); - auto result_table = std::make_unique(std::move(columns)); - std::vector col_names = {col_name}; - return std::make_unique(std::move(result_table), col_names); -} - -/** - * @brief Read a parquet file into a table - * - * @param filename The path to the parquet file - * @param columns The columns to read - * @param predicate The filter predicate to pushdown - */ -[[nodiscard]] std::unique_ptr read_parquet( - std::string const& filename, - std::vector const& columns = {}, - std::unique_ptr const& predicate = nullptr) -{ - CUDF_FUNC_RANGE(); - auto const source = cudf::io::source_info(filename); - auto builder = cudf::io::parquet_reader_options_builder(source); - if (!columns.empty()) { builder.columns(columns); } - if (predicate) { builder.filter(*predicate); } - auto const options = builder.build(); - auto table_with_metadata = cudf::io::read_parquet(options); - std::vector column_names; - for (auto const& col_info : table_with_metadata.metadata.schema_info) { - column_names.push_back(col_info.name); - } - return std::make_unique(std::move(table_with_metadata.tbl), column_names); -} - -/** - * @brief Generate the `std::tm` structure from year, month, and day - * - * @param year The year - * @param month The month - * @param day The day - */ -std::tm make_tm(int year, int month, int day) -{ - std::tm tm{}; - tm.tm_year = year - 1900; - tm.tm_mon = month - 1; - tm.tm_mday = day; - return tm; -} - -/** - * @brief Calculate the number of days since the UNIX epoch - * - * @param year The year - * @param month The month - * @param day The day - */ -int32_t days_since_epoch(int year, int month, int day) -{ - std::tm tm = make_tm(year, month, day); - std::tm epoch = make_tm(1970, 1, 1); - std::time_t time = std::mktime(&tm); - std::time_t epoch_time = std::mktime(&epoch); - double diff = std::difftime(time, epoch_time) / (60 * 60 * 24); - return static_cast(diff); -} - -struct tpch_example_args { - std::string dataset_dir; - std::string memory_resource_type; -}; - -/** - * @brief Parse command line arguments into a struct - * - * @param argc The number of command line arguments - * @param argv The command line arguments - */ -tpch_example_args parse_args(int argc, char const** argv) -{ - if (argc < 3) { - std::string usage_message = "Usage: " + std::string(argv[0]) + - " \n The query result will be " - "saved to a parquet file named q{query_no}.parquet in the current " - "working directory "; - throw std::runtime_error(usage_message); - } - tpch_example_args args; - args.dataset_dir = argv[1]; - args.memory_resource_type = argv[2]; - return args; -} diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index a3d6533705e..ff25a5bacae 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -128,6 +129,19 @@ class json_reader_options { // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; + // Validation checks for spark + // Should the json validation be strict or 
not + // Note: strict validation enforces the JSON specification https://www.json.org/json-en.html + bool _strict_validation = false; + // Allow leading zeros for numeric values. + bool _allow_numeric_leading_zeros = true; + // Allow non-numeric numbers: NaN, +INF, -INF, +Infinity, Infinity, -Infinity + bool _allow_nonnumeric_numbers = true; + // Allow unquoted control characters + bool _allow_unquoted_control_chars = true; + // Additional values to recognize as null values + std::vector<std::string> _na_values; + /** * @brief Constructor from source info. * @@ -298,6 +312,55 @@ class json_reader_options { */ [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; } + /** + * @brief Whether json validation should be enforced strictly or not. + * + * @return true if strict validation is enabled. + */ + [[nodiscard]] bool is_strict_validation() const { return _strict_validation; } + + /** + * @brief Whether leading zeros are allowed in numeric values. + * + * @note: This validation is enforced only if strict validation is enabled. + * + * @return true if leading zeros are allowed in numeric values + */ + [[nodiscard]] bool is_allowed_numeric_leading_zeros() const + { + return _allow_numeric_leading_zeros; + } + + /** + * @brief Whether the unquoted values NaN, +INF, -INF, +Infinity, Infinity, and -Infinity + * should be allowed as numbers. + * + * @note: This validation is enforced only if strict validation is enabled. + * + * @return true if nonnumeric numbers are allowed + */ + [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; } + + /** + * @brief Whether characters with code points 0 through 31 are allowed unescaped inside a + * quoted string. + * + * @note: This validation is enforced only if strict validation is enabled. + * + * @return true if unquoted control chars are allowed. + */ + [[nodiscard]] bool is_allowed_unquoted_control_chars() const + { + return _allow_unquoted_control_chars; + } + + /** + * @brief Returns additional values to recognize as null values. + * + * @return Additional values to recognize as null values + */ + [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; } + /** * @brief Set data types for columns to be read. * @@ -427,6 +490,63 @@ class json_reader_options { * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. */ void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } + + /** + * @brief Set whether strict validation is enabled or not. + * + * @param val Boolean value to indicate whether strict validation is enabled. + */ + void set_strict_validation(bool val) { _strict_validation = val; } + + /** + * @brief Set whether leading zeros are allowed in numeric values. Strict validation + * must be enabled for this to work. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + */ + void allow_numeric_leading_zeros(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_numeric_leading_zeros = val; + } + + /** + * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, + * Infinity, and -Infinity. Strict validation must be enabled for this to work. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. 
+ * + * @param val Boolean value to indicate whether nonnumeric numbers are allowed + */ + void allow_nonnumeric_numbers(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_nonnumeric_numbers = val; + } + + /** + * @brief Set whether characters with code points 0 through 31 are allowed unescaped inside + * a quoted string. Strict validation must be enabled for this to work. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val true to indicate whether unquoted control chars are allowed. + */ + void allow_unquoted_control_chars(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_unquoted_control_chars = val; + } + + /** + * @brief Sets additional values to recognize as null values. + * + * @param vals Vector of values to be considered to be null + */ + void set_na_values(std::vector<std::string> vals) { _na_values = std::move(vals); } +}; /** @@ -638,6 +758,76 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether json validation should be strict or not. + * + * @param val Boolean value to indicate whether json validation should be strict or not. + * @return this for chaining + */ + json_reader_options_builder& strict_validation(bool val) + { + options.set_strict_validation(val); + return *this; + } + + /** + * @brief Set whether leading zeros are allowed in numeric values. Strict validation must + * be enabled for this to have any effect. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + * @return this for chaining + */ + json_reader_options_builder& numeric_leading_zeros(bool val) + { + options.allow_numeric_leading_zeros(val); + return *this; + } + + /** + * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, + * +INF, -INF, +Infinity, Infinity, and -Infinity. + * Strict validation must be enabled for this to have any effect. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate if unquoted nonnumeric values are valid JSON or not. + * @return this for chaining + */ + json_reader_options_builder& nonnumeric_numbers(bool val) + { + options.allow_nonnumeric_numbers(val); + return *this; + } + + /** + * @brief Set whether characters with code points 0 through 31 are allowed unescaped in a + * quoted string. Strict validation must be enabled for this to have any effect. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate if unquoted control chars are allowed or not. + * @return this for chaining + */ + json_reader_options_builder& unquoted_control_chars(bool val) + { + options.allow_unquoted_control_chars(val); + return *this; + } + + /** + * @brief Sets additional values to recognize as null values. + * + * @param vals Vector of values to be considered to be null + * @return this for chaining + */ + json_reader_options_builder& na_values(std::vector<std::string> vals) + { + options.set_na_values(std::move(vals)); + return *this; + } + 
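A minimal sketch of these options working together, assuming a newline-delimited JSON source file named `data.jsonl`; invalid rows are nulled rather than failing the whole read:

```cpp
auto opts =
  cudf::io::json_reader_options::builder(cudf::io::source_info{"data.jsonl"})
    .lines(true)
    .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
    .strict_validation(true)        // must precede the options below
    .numeric_leading_zeros(false)   // reject values like 012
    .nonnumeric_numbers(true)       // accept NaN, Infinity, -Infinity, ...
    .unquoted_control_chars(false)  // reject raw control chars in strings
    .na_values({"N/A"})
    .build();
auto result = cudf::io::read_json(opts);
```

 /** * @brief move json_reader_options member once it's built. 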
*/ diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp index e7fe3cc7214..0d74a4158ad 100644 --- a/cpp/include/cudf/io/nvcomp_adapter.hpp +++ b/cpp/include/cudf/io/nvcomp_adapter.hpp @@ -24,7 +24,7 @@ namespace CUDF_EXPORT cudf { namespace io::nvcomp { -enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 }; +enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4, GZIP }; /** * @brief Set of parameters that impact whether nvCOMP features are enabled. diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f7108129dee..4f3fc7086f2 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -35,10 +35,11 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, ///< default - MULTILINE = 8, ///< the '^' and '$' honor new-line characters - DOTALL = 16, ///< the '.' matching includes new-line characters - ASCII = 256 ///< use only ASCII when matching built-in character classes + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16, ///< the '.' matching includes new-line characters + ASCII = 256, ///< use only ASCII when matching built-in character classes + EXT_NEWLINE = 512 ///< new-line matches extended characters }; /** @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f) return (f & regex_flags::ASCII) == regex_flags::ASCII; } +/** + * @brief Returns true if the given flags contain EXT_NEWLINE + * + * @param f Regex flags to check + * @return true if `f` includes EXT_NEWLINE + */ +constexpr bool is_ext_newline(regex_flags const f) +{ + return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE; +} + /** * @brief Capture groups setting * diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index abb26d7ccb4..14695c3bb27 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper __device__ inline string_view::const_iterator& string_view::const_iterator::operator--() { - if (byte_pos > 0) - while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) - ; + if (byte_pos > 0) { + if (byte_pos == char_pos) { + --byte_pos; + } else { + while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) + ; + } + } --char_pos; return *this; } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index c83a4260c19..7c909f1a948 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -72,7 +73,7 @@ std::unique_ptr minhash( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -133,7 +134,7 @@ std::unique_ptr minhash64( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column 
size limit + * + * @param input Strings column to compute minhash + * @param seeds Seed values used for the hash algorithm @@ -150,5 +151,61 @@ std::unique_ptr<cudf::column> minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column contains the seed results for the corresponding + * input row. The order of the elements in each row matches the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr<cudf::column> word_minhash( + cudf::lists_column_view const& input, + cudf::device_span<uint32_t const> seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column contains the seed results for the corresponding + * input row. The order of the elements in each row matches the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm though + * only the first 64 bits of the hash are used in computing the output. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr<cudf::column> word_minhash64( + cudf::lists_column_view const& input, + cudf::device_span<uint64_t const> seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 261a8eb401d..c3187f73a95 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -23,6 +23,7 @@ #include #include +#include <nvcomp/gzip.h> #include #include #include @@ -44,6 +45,8 @@ auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... 
return nvcompBatchedLZ4DecompressGetTempSizeEx(std::forward<Args>(args)...); case compression_type::DEFLATE: return nvcompBatchedDeflateDecompressGetTempSizeEx(std::forward<Args>(args)...); + case compression_type::GZIP: + return nvcompBatchedGzipDecompressGetTempSizeEx(std::forward<Args>(args)...); default: CUDF_FAIL("Unsupported compression type"); } } @@ -73,6 +76,8 @@ auto batched_decompress_async(compression_type compression, Args&&... args) case compression_type::DEFLATE: return nvcompBatchedDeflateDecompressAsync(std::forward<Args>(args)...); case compression_type::LZ4: return nvcompBatchedLZ4DecompressAsync(std::forward<Args>(args)...); + case compression_type::GZIP: + return nvcompBatchedGzipDecompressAsync(std::forward<Args>(args)...); default: CUDF_FAIL("Unsupported compression type"); } } @@ -84,6 +89,7 @@ std::string compression_type_name(compression_type compression) { case compression_type::ZSTD: return "Zstandard"; case compression_type::DEFLATE: return "Deflate"; case compression_type::LZ4: return "LZ4"; + case compression_type::GZIP: return "GZIP"; } return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")"; } @@ -359,8 +365,8 @@ std::optional<std::string> is_compression_disabled_impl(compression_type compres return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; } return std::nullopt; + default: return "Unsupported compression type"; } - return "Unsupported compression type"; } std::optional<std::string> is_compression_disabled(compression_type compression, @@ -396,7 +402,8 @@ std::optional<std::string> is_decompression_disabled_impl(compression_type compr feature_status_parameters params) { switch (compression) { - case compression_type::DEFLATE: { + case compression_type::DEFLATE: + case compression_type::GZIP: { if (not params.are_all_integrations_enabled) { return "DEFLATE decompression is experimental, you can enable it through " "`LIBCUDF_NVCOMP_POLICY` environment variable."; @@ -447,6 +454,7 @@ std::optional<std::string> is_decompression_disabled(compression_type compressio size_t required_alignment(compression_type compression) { switch (compression) { + case compression_type::GZIP: case compression_type::DEFLATE: return nvcompDeflateRequiredAlignment; case compression_type::SNAPPY: return nvcompSnappyRequiredAlignment; case compression_type::ZSTD: return nvcompZstdRequiredAlignment; @@ -462,7 +470,7 @@ std::optional<size_t> compress_max_allowed_chunk_size(compression_type compressi case compression_type::SNAPPY: return nvcompSnappyCompressionMaxAllowedChunkSize; case compression_type::ZSTD: return nvcompZstdCompressionMaxAllowedChunkSize; case compression_type::LZ4: return nvcompLZ4CompressionMaxAllowedChunkSize; - default: return std::nullopt; + default: CUDF_FAIL("Unsupported compression type"); } } diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 7899ea7bac4..97d5884fef1 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -16,6 +16,7 @@ #include "io/fst/lookup_tables.cuh" +#include <cudf/detail/nvtx/ranges.hpp> #include #include #include @@ -302,6 +303,7 @@ void normalize_single_quotes(datasource::owning_buffer& inda rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; auto parser = @@ -330,6 +332,7 @@ void normalize_whitespace(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t 
max_out = 2; auto parser = diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index d153cf5b909..49ad0cb1d34 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -226,6 +226,21 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr device_span<SymbolOffsetT> token_indices, rmm::cuda_stream_view stream); +/** + * @brief Validate the token stream according to the behavior specified in options. + * + * @param d_input The string of input characters + * @param tokens The tokens to be post-processed + * @param token_indices The tokens' corresponding indices that are post-processed + * @param options Parsing options specifying the parsing behaviour + * @param stream The cuda stream to dispatch GPU kernels to + */ +void validate_token_stream(device_span<char const> d_input, + device_span<PdaTokenT> tokens, + device_span<SymbolOffsetT> token_indices, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream); + /** + * @brief Parses the given JSON string and generates a tree representation of the given input. + * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index d76e5447c30..4e513d3495c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1660,6 +1660,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge if (delimiter_offset == 1) { tokens.set_element(0, token_t::LineEnd, stream); + validate_token_stream(json_in, tokens, tokens_indices, options, stream); auto [filtered_tokens, filtered_tokens_indices] = process_token_stream(tokens, tokens_indices, stream); tokens = std::move(filtered_tokens); @@ -2082,7 +2083,9 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt parse_opts.keepquotes = options.is_enabled_keep_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + std::vector<std::string> na_values{"", "null"}; + na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); + parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); return parse_opts; } diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu new file mode 100644 index 00000000000..83c7b663980 --- /dev/null +++ b/cpp/src/io/json/process_tokens.cu @@ -0,0 +1,310 @@ + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "io/utilities/trie.cuh" +#include "nested_json.hpp" +#include "tabulate_output_iterator.cuh" + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::io::json { +namespace detail { + +struct write_if { + using token_t = cudf::io::json::token_t; + using scan_type = thrust::pair; + PdaTokenT* tokens; + size_t n; + // Index, value + __device__ void operator()(size_type i, scan_type x) + { + if (i == n - 1 or tokens[i + 1] == token_t::LineEnd) { + if (x.first == token_t::ErrorBegin and tokens[i] != token_t::ErrorBegin) { + tokens[i] = token_t::ErrorBegin; + } + } + } +}; + +enum class number_state { + START = 0, + SAW_NEG, // not a complete state + LEADING_ZERO, + WHOLE, + SAW_RADIX, // not a complete state + FRACTION, + START_EXPONENT, // not a complete state + AFTER_SIGN_EXPONENT, // not a complete state + EXPONENT +}; + +enum class string_state { + NORMAL = 0, + ESCAPED, // not a complete state + ESCAPED_U // not a complete state +}; + +__device__ inline bool substr_eq(const char* data, + SymbolOffsetT const start, + SymbolOffsetT const end, + SymbolOffsetT const expected_len, + const char* expected) +{ + if (end - start != expected_len) { return false; } + for (auto idx = 0; idx < expected_len; idx++) { + if (data[start + idx] != expected[idx]) { return false; } + } + return true; +} + +void validate_token_stream(device_span d_input, + device_span tokens, + device_span token_indices, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + if (!options.is_strict_validation()) { return; } + using token_t = cudf::io::json::token_t; + cudf::detail::optional_trie trie_na = + cudf::detail::create_serialized_trie(options.get_na_values(), stream); + auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto validate_values = cuda::proclaim_return_type( + [data = d_input.data(), + trie_na = trie_na_view, + allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), + allow_nonnumeric = + options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates an unquoted value. 
A value must match https://www.json.org/json-en.html + // but the leading and trailing whitespace should already have been removed, and the + // value is not a quoted string + auto c = data[start]; + auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); + if (is_null_literal) { + return true; + } else if ('n' == c) { + return substr_eq(data, start, end, 4, "null"); + } else if ('t' == c) { + return substr_eq(data, start, end, 4, "true"); + } else if ('f' == c) { + return substr_eq(data, start, end, 5, "false"); + } else if (allow_nonnumeric && c == 'N') { + return substr_eq(data, start, end, 3, "NaN"); + } else if (allow_nonnumeric && c == 'I') { + return substr_eq(data, start, end, 8, "Infinity"); + } else if (allow_nonnumeric && c == '+') { + return substr_eq(data, start, end, 4, "+INF") || + substr_eq(data, start, end, 9, "+Infinity"); + } else if ('-' == c || ('0' <= c && c <= '9')) { + // number + auto num_state = number_state::START; + for (auto at = start; at < end; at++) { + c = data[at]; + switch (num_state) { + case number_state::START: + if ('-' == c) { + num_state = number_state::SAW_NEG; + } else if ('0' == c) { + num_state = number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else { + return false; + } + break; + case number_state::SAW_NEG: + if ('0' == c) { + num_state = number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else if (allow_nonnumeric && 'I' == c) { + return substr_eq(data, start, end, 4, "-INF") || + substr_eq(data, start, end, 9, "-Infinity"); + } else { + return false; + } + break; + case number_state::LEADING_ZERO: + if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.' == c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::WHOLE: + if (c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.'
== c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::SAW_RADIX: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::FRACTION: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::START_EXPONENT: + if ('+' == c || '-' == c) { + num_state = number_state::AFTER_SIGN_EXPONENT; + } else if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::AFTER_SIGN_EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + } + } + return num_state != number_state::AFTER_SIGN_EXPONENT && + num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG && + num_state != number_state::SAW_RADIX; + } else { + return false; + } + }); + + auto validate_strings = cuda::proclaim_return_type( + [data = d_input.data(), + allow_unquoted_control_chars = + options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates a quoted string. A string must match https://www.json.org/json-en.html + // but we already know that it has a starting and ending " and all white space has been + // stripped out. Also the base CUDF validation makes sure escaped chars are correct + // so we only need to worry about unquoted control chars + + auto state = string_state::NORMAL; + auto u_count = 0; + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (!allow_unquoted_control_chars && static_cast(c) >= 0 && static_cast(c) < 32) { + return false; + } + + switch (state) { + case string_state::NORMAL: + if (c == '\\') { state = string_state::ESCAPED; } + break; + case string_state::ESCAPED: + // in Spark you can allow any char to be escaped, but CUDF + // validates it in some cases so we need to also validate it. 
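+ // The only single-character escapes accepted here are " \ / b f n r t; a 'u' escape + // must be followed by exactly four hex digits, which the ESCAPED_U state below enforces.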
+ if (c == 'u') { + state = string_state::ESCAPED_U; + u_count = 0; + } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || + c == 'r' || c == 't') { + state = string_state::NORMAL; + } else { + return false; + } + break; + case string_state::ESCAPED_U: + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + u_count++; + if (u_count == 4) { + state = string_state::NORMAL; + u_count = 0; + } + } else { + return false; + } + break; + } + } + return string_state::NORMAL == state; + }); + + auto num_tokens = tokens.size(); + auto count_it = thrust::make_counting_iterator(0); + auto predicate = [tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_values, + validate_strings] __device__(auto i) -> bool { + if (tokens[i] == token_t::ValueEnd) { + return !validate_values(token_indices[i - 1], token_indices[i]); + } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { + return !validate_strings(token_indices[i - 1], token_indices[i]); + } + return false; + }; + + using scan_type = write_if::scan_type; + auto conditional_write = write_if{tokens.begin(), num_tokens}; + auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); + auto transform_op = cuda::proclaim_return_type( + [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; + }); + auto binary_op = cuda::proclaim_return_type( + [] __device__(scan_type prev, scan_type curr) -> scan_type { + auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); + return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); + }); + + thrust::transform_inclusive_scan(rmm::exec_policy(stream), + count_it, + count_it + num_tokens, + conditional_output_it, + transform_op, + binary_op); // in-place scan +} +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh new file mode 100644 index 00000000000..7cf3655e259 --- /dev/null +++ b/cpp/src/io/json/tabulate_output_iterator.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +// Proxy reference that calls BinaryFunction with index value and the rhs of assignment operator +template +class tabulate_output_iterator_proxy { + public: + __host__ __device__ tabulate_output_iterator_proxy(const IndexT index, BinaryFunction fun) + : index(index), fun(fun) + { + } + template + __host__ __device__ tabulate_output_iterator_proxy operator=(const T& rhs_value) + { + fun(index, rhs_value); + return *this; + } + + private: + IndexT index; + BinaryFunction fun; +}; + +/** + * @brief Tabulate output iterator with custom binary function which takes index and value. + * + * @code {.cpp} + * #include "tabulate_output_iterator.cuh" + * #include + * #include + * #include + * + * struct set_bits_field { + * int* bitfield; + * __device__ inline void set_bit(size_t bit_index) + * { + * atomicOr(&bitfield[bit_index/32], (int{1} << (bit_index % 32))); + * } + * __device__ inline void clear_bit(size_t bit_index) + * { + * atomicAnd(&bitfield[bit_index / 32], ~(int{1} << (bit_index % 32))); + * } + * // Index, value + * __device__ void operator()(size_t i, bool x) + * { + * if (x) + * set_bit(i); + * else + * clear_bit(i); + * } + * }; + * + * thrust::device_vector v(1, 0x00000000); + * auto result_begin = cudf::detail::make_tabulate_output_iterator(set_bits_field{v.data().get()}); + * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + * [] __device__ (int x) { return x%2; }); + * thrust::copy(thrust::device, value, value+32, result_begin); + * assert(v[0] == 0xaaaaaaaa); + * @endcode + * + * @tparam BinaryFunction Binary function to be called with the index value and the rhs of the + * assignment operator. + * @tparam Iterator iterator type that acts as index of the output. + */ +template +class tabulate_output_iterator + : public thrust::iterator_adaptor, + thrust::counting_iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + tabulate_output_iterator_proxy> { + public: + // parent class. + using super_t = thrust::iterator_adaptor, + thrust::counting_iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + tabulate_output_iterator_proxy>; + // friend thrust::iterator_core_access to allow it access to the private interface dereference() + friend class thrust::iterator_core_access; + __host__ __device__ tabulate_output_iterator(BinaryFunction fun) : fun(fun) {} + + private: + BinaryFunction fun; + + // thrust::iterator_core_access accesses this function + __host__ __device__ typename super_t::reference dereference() const + { + return tabulate_output_iterator_proxy(*this->base(), fun); + } +}; + +template +tabulate_output_iterator __host__ __device__ +make_tabulate_output_iterator(BinaryFunction fun) +{ + return tabulate_output_iterator(fun); +} // end make_tabulate_output_iterator + +} // namespace detail +} // namespace cudf + +// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from +// type_traits to enable its use with algorithms.
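+// Without this trait specialization, generic algorithms would treat the proxy as a plain +// value type and assignments made through the iterator would not reach the supplied functor.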
+template +struct thrust::detail::is_proxy_reference< + cudf::detail::tabulate_output_iterator_proxy> + : public thrust::detail::true_type {}; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a3f91f6859b..9ed2929a70e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -893,7 +893,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; - constexpr int batch_size = 32; + constexpr int batch_size = cudf::detail::warp_size; int cur_leaf_count = target_leaf_count; while (s->error == 0 && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 5d10472b0ae..7c985643887 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -203,10 +203,9 @@ struct SchemaElement { bool operator==(SchemaElement const& other) const { return type == other.type && converted_type == other.converted_type && - type_length == other.type_length && repetition_type == other.repetition_type && - name == other.name && num_children == other.num_children && - decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision && - field_id == other.field_id; + type_length == other.type_length && name == other.name && + num_children == other.num_children && decimal_scale == other.decimal_scale && + decimal_precision == other.decimal_precision && field_id == other.field_id; } // the parquet format is a little squishy when it comes to interpreting diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 125d35f6499..1390339c1ae 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -400,7 +400,8 @@ struct ColumnChunkDesc { int32_t src_col_schema_, column_chunk_info const* chunk_info_, float list_bytes_per_row_est_, - bool strings_to_categorical_) + bool strings_to_categorical_, + int32_t src_file_idx_) : compressed_data(compressed_data_), compressed_size(compressed_size_), num_values(num_values_), @@ -419,7 +420,8 @@ struct ColumnChunkDesc { src_col_schema(src_col_schema_), h_chunk_info(chunk_info_), list_bytes_per_row_est(list_bytes_per_row_est_), - is_strings_to_cat(strings_to_categorical_) + is_strings_to_cat(strings_to_categorical_), + src_file_idx(src_file_idx_) { } @@ -456,6 +458,7 @@ struct ColumnChunkDesc { bool is_strings_to_cat{}; // convert strings to hashes bool is_large_string_col{}; // `true` if string data uses 64-bit offsets + int32_t src_file_idx{}; // source file index }; /** diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 84f0dab0d8b..c588fedb85c 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -865,8 +865,18 @@ std::vector compute_page_splits_by_row(device_spanmap_schema_index(col.schema_idx, + rg.source_index); + }); it != columns.end()) { return std::distance(columns.begin(), it); } @@ -1525,7 +1538,8 @@ void reader::impl::create_global_chunk_info() auto col = _input_columns[i]; // look up metadata auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); + auto& schema = _metadata->get_schema( + _metadata->map_schema_index(col.schema_idx, rg.source_index), rg.source_index); auto [clock_rate, logical_type] = 
conversion_info(to_type_id(schema, _strings_to_categorical, _options.timestamp_type.id()), @@ -1564,9 +1578,9 @@ void reader::impl::create_global_chunk_info() col.schema_idx, chunk_info, list_bytes_per_row_est, - schema.type == BYTE_ARRAY and _strings_to_categorical)); + schema.type == BYTE_ARRAY and _strings_to_categorical, + rg.source_index)); } - // Adjust for skip_rows when updating the remaining rows after the first group remaining_rows -= (skip_rows) ? std::min(rg.start_row + row_group.num_rows - skip_rows, remaining_rows) diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 8b5678f202b..6d566b5815e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -423,8 +423,13 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf std::vector chunks(rg.columns.size()); for (size_t col_idx = 0; col_idx < rg.columns.size(); col_idx++) { - auto const& col_chunk = rg.columns[col_idx]; - auto& schema = get_schema(col_chunk.schema_idx); + auto const& col_chunk = rg.columns[col_idx]; + auto const is_schema_idx_mapped = + is_schema_index_mapped(col_chunk.schema_idx, rg_info.source_index); + auto const mapped_schema_idx = is_schema_idx_mapped + ? map_schema_index(col_chunk.schema_idx, rg_info.source_index) + : col_chunk.schema_idx; + auto& schema = get_schema(mapped_schema_idx, is_schema_idx_mapped ? rg_info.source_index : 0); auto const max_def_level = schema.max_definition_level; auto const max_rep_level = schema.max_repetition_level; @@ -559,22 +564,40 @@ aggregate_reader_metadata::aggregate_reader_metadata( num_rows(calc_num_rows()), num_row_groups(calc_num_row_groups()) { - // Validate that all sources have the same schema unless we are reading select columns - // from mismatched sources, in which case, we will only check the projected columns later. - if (per_file_metadata.size() > 1 and not has_cols_from_mismatched_srcs) { - auto const& first_meta = per_file_metadata.front(); + if (per_file_metadata.size() > 1) { + auto& first_meta = per_file_metadata.front(); auto const num_cols = first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0; - auto const& schema = first_meta.schema; - - // Verify that the input files have matching numbers of columns and schema. - for (auto const& pfm : per_file_metadata) { - if (pfm.row_groups.size() > 0) { - CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), - "All sources must have the same number of columns"); + auto& schema = first_meta.schema; + + // Validate that all sources have the same schema unless we are reading select columns + // from mismatched sources, in which case, we will only check the projected columns later. + if (not has_cols_from_mismatched_srcs) { + // Verify that the input files have matching numbers of columns and schema. + for (auto const& pfm : per_file_metadata) { + if (pfm.row_groups.size() > 0) { + CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), + "All sources must have the same number of columns"); + } + CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } - CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } + + // Mark the column schema in the first (default) source as nullable if it is nullable in any of + // the input sources. This avoids recomputing this within build_column() and + // populate_metadata(). 
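+ // A column is nullable if its repetition_type is not REQUIRED in at least one source, + // so flip REQUIRED to OPTIONAL in the zeroth schema whenever a later source disagrees.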
+ std::for_each( + thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(schema.size()), + [&](auto const schema_idx) { + if (schema[schema_idx].repetition_type == REQUIRED and + std::any_of( + per_file_metadata.begin() + 1, per_file_metadata.end(), [&](auto const& pfm) { + return pfm.schema[schema_idx].repetition_type != REQUIRED; + })) { + schema[schema_idx].repetition_type = OPTIONAL; + } + }); } // Collect and apply arrow:schema from Parquet's key value metadata section @@ -884,15 +907,8 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t size_type src_idx, int schema_idx) const { - // schema_idx_maps will only have > 0 size when we are reading matching column projection from - // mismatched Parquet sources. - if (src_idx and not schema_idx_maps.empty()) { - auto const& schema_idx_map = schema_idx_maps[src_idx - 1]; - CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), - "Unmapped schema index encountered in the specified source tree", - std::range_error); - schema_idx = schema_idx_map.at(schema_idx); - } + // Map schema index to the provided source file index + schema_idx = map_schema_index(schema_idx, src_idx); auto col = std::find_if(per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), @@ -924,6 +940,46 @@ aggregate_reader_metadata::get_rowgroup_metadata() const return rg_metadata; } +bool aggregate_reader_metadata::is_schema_index_mapped(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // True if root index requested or zeroth file index or schema_idx maps doesn't exist. (i.e. + // schemas are identical). + if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return true; } + + // Check if mapped + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + return schema_idx_map.find(schema_idx) != schema_idx_map.end(); +} + +int aggregate_reader_metadata::map_schema_index(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // Check if pfm_idx is zero or root index requested or schema_idx_maps doesn't exist (i.e. + // schemas are identical). + if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return schema_idx; } + + // schema_idx_maps will only have > 0 size when we are reading matching column projection from + // mismatched Parquet sources. + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), + "Unmapped schema index encountered in the specified source tree", + std::out_of_range); + + // Return the mapped schema idx. 
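+ // (.at() cannot throw here; the CUDF_EXPECTS above has verified that the mapping exists.)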
+ return schema_idx_map.at(schema_idx); +} + std::string aggregate_reader_metadata::get_pandas_index() const { // Assumes that all input files have the same metadata @@ -1185,8 +1241,8 @@ aggregate_reader_metadata::select_columns( // Compares two schema elements to be equal except their number of children auto const equal_to_except_num_children = [](SchemaElement const& lhs, SchemaElement const& rhs) { return lhs.type == rhs.type and lhs.converted_type == rhs.converted_type and - lhs.type_length == rhs.type_length and lhs.repetition_type == rhs.repetition_type and - lhs.name == rhs.name and lhs.decimal_scale == rhs.decimal_scale and + lhs.type_length == rhs.type_length and lhs.name == rhs.name and + lhs.decimal_scale == rhs.decimal_scale and lhs.decimal_precision == rhs.decimal_precision and lhs.field_id == rhs.field_id; }; @@ -1209,6 +1265,11 @@ aggregate_reader_metadata::select_columns( "the selected path", std::invalid_argument); + // Get the schema_idx_map for this data source (pfm) + auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + // Map the schema index from 0th tree (src) to the one in the current (dst) tree. + schema_idx_map[src_schema_idx] = dst_schema_idx; + // If src_schema_elem is a stub, it does not exist in the column_name_info and column_buffer // hierarchy. So continue on with mapping. if (src_schema_elem.is_stub()) { @@ -1262,15 +1323,6 @@ aggregate_reader_metadata::select_columns( pfm_idx); }); } - - // We're at a leaf and this is an input column (one with actual data stored) so map it. - if (src_schema_elem.num_children == 0) { - // Get the schema_idx_map for this data source (pfm) - auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; - - // Map the schema index from 0th tree (src) to the one in the current (dst) tree. - schema_idx_map[src_schema_idx] = dst_schema_idx; - } }; std::vector output_column_schemas; diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 6f2863136b2..6487c92f48f 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -234,6 +234,26 @@ class aggregate_reader_metadata { [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } + /** + * @brief Checks if a schema index from 0th source is mapped to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to check mappings for. + * + * @return True if schema index is mapped + */ + [[nodiscard]] bool is_schema_index_mapped(int schema_idx, int pfm_idx) const; + + /** + * @brief Maps schema index from 0th source file to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to map the schema_idx to. 
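+ * + * @throws std::out_of_range if the indices are invalid or if the schema index is not + * mapped in the pfm_idx'th source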
+ * + * @return Mapped schema index + */ + [[nodiscard]] int map_schema_index(int schema_idx, int pfm_idx) const; + /** * @brief Extracts the schema_idx'th SchemaElement from the pfm_idx'th file * @@ -248,7 +268,7 @@ class aggregate_reader_metadata { CUDF_EXPECTS( schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), "Parquet reader encountered an invalid schema_idx or pfm_idx", - std::invalid_argument); + std::out_of_range); return per_file_metadata[pfm_idx].schema[schema_idx]; } @@ -256,7 +276,10 @@ class aggregate_reader_metadata { [[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); } /** - * @brief Gets the concrete nesting depth of output cudf columns + * @brief Gets the concrete nesting depth of output cudf columns. + * + * Gets the nesting depth of the output cudf column for the given schema. + * The nesting depth must be equal for the given schema_index across all sources. * * @param schema_index Schema index of the input column * diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 52918f5bc80..8e67f233213 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -79,23 +79,30 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str * is indicated when adding new values. This function generates the mappings of * the R/D levels to those start/end bounds * - * @param remap Maps column schema index to the R/D remapping vectors for that column - * @param src_col_schema The column schema to generate the new mapping for + * @param remap Maps column schema index to the R/D remapping vectors for that column for a + * particular input source file + * @param src_col_schema The source column schema to generate the new mapping for + * @param mapped_src_col_schema Mapped column schema for src_file_idx'th file + * @param src_file_idx The input source file index for the column schema * @param md File metadata information */ -void generate_depth_remappings(std::map, std::vector>>& remap, - int src_col_schema, - aggregate_reader_metadata const& md) +void generate_depth_remappings( + std::map, std::pair, std::vector>>& remap, + int const src_col_schema, + int const mapped_src_col_schema, + int const src_file_idx, + aggregate_reader_metadata const& md) { // already generated for this level - if (remap.find(src_col_schema) != remap.end()) { return; } - auto schema = md.get_schema(src_col_schema); - int max_depth = md.get_output_nesting_depth(src_col_schema); + if (remap.find({src_col_schema, src_file_idx}) != remap.end()) { return; } + auto const& schema = md.get_schema(mapped_src_col_schema, src_file_idx); + auto const max_depth = md.get_output_nesting_depth(src_col_schema); - CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), + CUDF_EXPECTS(remap.find({src_col_schema, src_file_idx}) == remap.end(), "Attempting to remap a schema more than once"); auto inserted = - remap.insert(std::pair, std::vector>>{src_col_schema, {}}); + remap.insert(std::pair, std::pair, std::vector>>{ + {src_col_schema, src_file_idx}, {}}); auto& depth_remap = inserted.first->second; std::vector& rep_depth_remap = (depth_remap.first); @@ -136,15 +143,15 @@ void generate_depth_remappings(std::map, std::ve auto find_shallowest = [&](int r) { int shallowest = -1; int cur_depth = max_depth - 1; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; while (schema_idx > 0) { - auto cur_schema = 
md.get_schema(schema_idx); + auto& cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r) { // if this is a repeated field, map it one level deeper shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; } // if it's one-level encoding list - else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx))) { + else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx, src_file_idx))) { shallowest = cur_depth - 1; } if (!cur_schema.is_stub()) { cur_depth--; } @@ -159,10 +166,10 @@ void generate_depth_remappings(std::map, std::ve for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { auto find_deepest = [&](int d) { SchemaElement prev_schema; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; int r1 = 0; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_definition_level == d) { // if this is a repeated field, map it one level deeper r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level @@ -175,10 +182,10 @@ void generate_depth_remappings(std::map, std::ve // we now know R1 from above. return the deepest nesting level that has the // same repetition level - schema_idx = src_col_schema; + schema_idx = mapped_src_col_schema; int depth = max_depth - 1; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r1) { // if this is a repeated field, map it one level deeper depth = cur_schema.is_stub() ? depth + 1 : depth; @@ -783,9 +790,20 @@ void reader::impl::allocate_nesting_info() std::vector per_page_nesting_info_size(num_columns); auto iter = thrust::make_counting_iterator(size_type{0}); std::transform(iter, iter + num_columns, per_page_nesting_info_size.begin(), [&](size_type i) { + // Schema index of the current input column auto const schema_idx = _input_columns[i].schema_idx; - auto const& schema = _metadata->get_schema(schema_idx); - return max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx)); + // Get the max_definition_level of this column across all sources. + auto max_definition_level = _metadata->get_schema(schema_idx).max_definition_level + 1; + std::for_each(thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const& schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + max_definition_level = + std::max(max_definition_level, schema.max_definition_level + 1); + }); + + return std::max(max_definition_level, _metadata->get_output_nesting_depth(schema_idx)); }); // compute total # of page_nesting infos needed and allocate space. doing this in one @@ -813,6 +831,8 @@ void reader::impl::allocate_nesting_info() page_nesting_decode_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size[idx]; + // Set the number of output nesting levels from the zeroth source as nesting must be + // identical across sources. 
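+ // (Per-source schemas may differ in max_definition_level but never in output nesting depth.)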
pages[target_page_index + p_idx].num_output_nesting_levels = _metadata->get_output_nesting_depth(src_col_schema); @@ -821,25 +841,36 @@ void reader::impl::allocate_nesting_info() target_page_index += subpass.column_page_count[idx]; } + // Reset the target_page_index + target_page_index = 0; + // fill in int nesting_info_index = 0; - std::map, std::vector>> depth_remapping; for (size_t idx = 0; idx < _input_columns.size(); idx++) { auto const src_col_schema = _input_columns[idx].schema_idx; - // schema of the input column - auto& schema = _metadata->get_schema(src_col_schema); // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) + // nesting depth must be same across sources so getting it from the zeroth source is ok int const max_output_depth = _metadata->get_output_nesting_depth(src_col_schema); + // Map to store depths if this column has lists + std::map, std::pair, std::vector>> depth_remapping; // if this column has lists, generate depth remapping - std::map, std::vector>> depth_remapping; - if (schema.max_repetition_level > 0) { - generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); - } + std::for_each( + thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const mapped_schema_idx = _metadata->map_schema_index(src_col_schema, src_file_idx); + if (_metadata->get_schema(mapped_schema_idx, src_file_idx).max_repetition_level > 0) { + generate_depth_remappings( + depth_remapping, src_col_schema, mapped_schema_idx, src_file_idx, *_metadata); + } + }); // fill in host-side nesting info - int schema_idx = src_col_schema; + int schema_idx = src_col_schema; + // This is okay as we only use this to check stubness of cur_schema and + // to get its parent's indices, both of which are one to one mapped. auto cur_schema = _metadata->get_schema(schema_idx); int cur_depth = max_output_depth - 1; while (schema_idx > 0) { @@ -848,6 +879,9 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { + // Source file index for the current page. + auto const src_file_idx = + pass.chunks[pages[target_page_index + p_idx].chunk_idx].src_file_idx; PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; @@ -855,9 +889,11 @@ void reader::impl::allocate_nesting_info() &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; + auto const mapped_src_col_schema = + _metadata->map_schema_index(src_col_schema, src_file_idx); // if we have lists, set our start and end depth remappings - if (schema.max_repetition_level > 0) { - auto remap = depth_remapping.find(src_col_schema); + if (_metadata->get_schema(mapped_src_col_schema, src_file_idx).max_repetition_level > 0) { + auto remap = depth_remapping.find({src_col_schema, src_file_idx}); CUDF_EXPECTS(remap != depth_remapping.end(), "Could not find depth remapping for schema"); std::vector const& rep_depth_remap = (remap->second.first); @@ -871,11 +907,15 @@ void reader::impl::allocate_nesting_info() } } + // Get the schema from the current input source. 
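+ // Its max_definition_level and physical type may differ from the zeroth source, so read + // them from the schema of the file that produced this page.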
+ auto& actual_cur_schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + // values indexed by output column index - nesting_info[cur_depth].max_def_level = cur_schema.max_definition_level; + nesting_info[cur_depth].max_def_level = actual_cur_schema.max_definition_level; pni[cur_depth].size = 0; pni[cur_depth].type = - to_type_id(cur_schema, _strings_to_categorical, _options.timestamp_type.id()); + to_type_id(actual_cur_schema, _strings_to_categorical, _options.timestamp_type.id()); pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL; } @@ -888,6 +928,8 @@ void reader::impl::allocate_nesting_info() cur_schema = _metadata->get_schema(schema_idx); } + // Offset the page and nesting info indices + target_page_index += subpass.column_page_count[idx]; nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]); } diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index adf650a4f27..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,15 +539,26 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return BOL; } case '$': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -959,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 3b899e4edc1..e34a1e12015 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() list2 = tmp; } +/** + * @brief Check for supported new-line characters + * + * '\n, \r, \u0085, \u2028, or \u2029' + */ +constexpr bool is_newline(char32_t const ch) +{ + return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); +} + /** * @brief Utility to check a specific character against this class instance. 
* @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (checkstart) { auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { - case BOL: - if (pos == 0) break; - if (jnk.startchar != '^') { return cuda::std::nullopt; } + case BOL: { + if (pos == 0) { break; } + if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; } + if (startchar != '\n') { break; } --itr; startchar = static_cast('\n'); + [[fallthrough]]; + } case CHAR: { auto const find_itr = find_char(startchar, dstr, itr); if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; } @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const id_activate = inst.u2.next_id; expanded = true; break; - case BOL: - if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + case BOL: { + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; + if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) || + ((inst.u1.c == 'S') && (is_newline(prev_c)))) { id_activate = inst.u2.next_id; expanded = true; } break; - case EOL: + } + case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line + bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || - ((c == '\n') && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) { + (nl && (inst.u1.c != 'Z') && + ((inst.u1.c == '$' || inst.u1.c == 'S') || + (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case BOW: case NBOW: { - auto const prev_c = pos > 0 ? dstr[pos - 1] : 0; + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; auto const word_class = reclass_device{CCLASS_W}; bool const curr_is_word = word_class.is_match(c, _codepoint_flags); bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags); @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (c != '\n') id_activate = inst.u2.next_id; - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; case NCCLASS: case CCLASS: { diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index 978a844c476..4c39fc96397 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -122,26 +122,28 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings, break; } size_type const cc = (itr < end) && is_begin_utf8_char(*itr); - size_type const bc = (itr < end); + size_type const bc = (itr < end) ? 
bytes_in_utf8_byte(*itr) : 0; char_count += cg::reduce(warp, cc, cg::plus()); byte_count += cg::reduce(warp, bc, cg::plus()); itr += cudf::detail::warp_size; } + __syncwarp(); + if (warp.thread_rank() == 0) { if (start >= char_count) { d_output[str_idx] = string_index_pair{"", 0}; return; } - // we are just below start/stop and must now increment up to it from here + // we are just below start/stop and must now increment up to them from here auto first_byte = start_counts.second; if (start_counts.first < start) { auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte); first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first)); } - stop = max(stop, char_count); + stop = min(stop, char_count); auto last_byte = stop_counts.second; if (stop_counts.first < stop) { auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 605582f28a6..a03a34f5fa7 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -151,15 +153,111 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; minhash_kernel<<>>( *d_strings, seeds, width, d_hashes); return hashes; } -std::unique_ptr build_list_result(cudf::strings_column_view const& input, +/** + * @brief Compute the minhash of each list row of strings for each seed + * + * This is a warp-per-row algorithm where parallel threads within a warp + * work on strings in a single list row. + * + * @tparam HashFunction hash function to use on each string + * + * @param d_input List of strings to process + * @param seeds Seeds for hashing each string + * @param d_hashes Minhash output values (one per row) + */ +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, + cudf::device_span seeds, + hash_value_type* d_hashes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = idx / cudf::detail::warp_size; + + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const d_output = d_hashes + (row_idx * seeds.size()); + + // initialize hashes output for this row + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + if (lane_idx == 0) { + auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + } + __syncwarp(); + + // each lane hashes a string from the input row + for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { + auto const hash_str = + d_row.is_null(str_idx) ? 
cudf::string_view{} : d_row.element(str_idx); + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = HashFunction(seeds[seed_idx]); + // hash string and store the min value + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. + hv = thrust::get<0>(hasher(hash_str)); + } + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hv, cuda::std::memory_order_relaxed); + } + } +} + +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(seeds.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_hashes = hashes->mutable_view().data(); + auto lcdv = cudf::detail::lists_column_device_view(*d_input); + + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; + minhash_word_kernel + <<>>(lcdv, seeds, d_hashes); + + return hashes; +} + +std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, rmm::cuda_stream_view stream, @@ -176,7 +274,7 @@ std::unique_ptr build_list_result(cudf::strings_column_view const& std::move(offsets), std::move(hashes), input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input, stream, mr), stream, mr); // expect this condition to be very rare @@ -208,7 +306,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -232,7 +330,27 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return 
build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } } // namespace detail @@ -276,4 +394,21 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash(input, seeds, stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash64(input, seeds, stream, mr); +} } // namespace nvtext diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 5f890722345..e512d28b625 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2180,6 +2180,86 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) cudf::set_pinned_memory_resource(last_mr); } +// Validation +TEST_F(JsonReaderTest, ValueValidation) +{ + // parsing error as null rows + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2 }{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid/null based on option) + R"({"a": 1, "c":nan, "d": "null" } )" + "\n" + "\n" + // 4 -> (valid/null based on option) + R"({"a":04, "c": 1.23, "d": "abc"} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> ((valid/null based on option) + R"({"a":06} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + // leadingZeros allowed + // na_values, + { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .strict_validation(true); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 4); + EXPECT_EQ(result.tbl->num_rows(), 8); + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0}, + {true, false, false, false, true, true, true, false}}; + auto b_column = cudf::test::structs_column_wrapper( + {b_a_col}, {false, false, true, false, false, false, false, false}); + auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0}, + {false, false, false, false, true, false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column); + } + // leadingZeros not allowed, NaN allowed + { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + 
.strict_validation(true) + .numeric_leading_zeros(false) + .na_values({"nan"}); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 4); + EXPECT_EQ(result.tbl->num_rows(), 8); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8); // empty column + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, + {true, false, false, true, false, true, false, false}}; + auto b_column = cudf::test::structs_column_wrapper( + {b_a_col}, {false, false, true, false, false, false, false, false}); + auto c_column = int8_wrapper({0, 0, 0, 0, 0, 0, 0, 0}, + {false, false, false, false, false, false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column); + } +} + TEST_F(JsonReaderTest, MixedTypes) { using LCWS = cudf::test::lists_column_wrapper; diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c816316d0ff..acf850c7a66 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } +TEST_F(StringsContainsTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", + "qqq\rzzé" LINE_SEPARATOR "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\nzzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto ml_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags); + + auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // inst ANY will stop matching on first 'newline' and 
so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // including the DOTALL flag accepts the newline characters + auto dot_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::DOTALL); + prog = cudf::strings::regex_program::create(pattern, dot_flags); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EndOfString) { auto input = cudf::test::strings_column_wrapper( diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index b26cbd5a549..1491da758d5 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -14,9 +14,12 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include +#include #include #include @@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsExtractTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", + "qqq" LINE_SEPARATOR "zzé\rlll", + "zzé", + "", + "zzé" NEXT_LINE, + "abc" PARAGRAPH_SEPARATOR "zzé\n"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::extract(view, *prog); + auto expected = + cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::extract(view, *prog_ml); + expected = + cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""}, + {0, 1, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + // expect no matches here since the newline(s) interrupts the pattern + prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4582dcb1e38..47606b9b3ed 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "special_chars.h" + #include #include #include @@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé", + "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", + "zzé", + "", + "zzé\r", + "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::findall(view, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::findall(view, *prog_ml); + LCW expected_ml( + {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 8c0482653fb..9847d8d6bb5 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } +TEST_F(StringsReplaceRegexTest, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::replace_re(view, *prog, repl); + auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_re(view, *prog_ml, repl); + expected = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_", + "qqq" NEXT_LINE "_" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\r_\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^zzé$)"); + prog = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_with_backrefs(view, *prog, repl_template); + expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]", + "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll", + "[zzé]", + "", + "[zzé]" PARAGRAPH_SEPARATOR, + "abc\r[zzé]\r"}); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp index 52e439bd93f..7f7fd9d521b 100644 --- a/cpp/tests/strings/slice_tests.cpp +++ b/cpp/tests/strings/slice_tests.cpp @@ -268,6 +268,25 @@ TEST_F(StringsSliceTest, MaxPositions) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsSliceTest, MultiByteChars) +{ + auto input = cudf::test::strings_column_wrapper({ + // clang-format off + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "the following code snippet demonstrates how to use search for values in an ordered range " + // this placement tests proper multi-byte chars handling ------vvvvv + "it returns the last position where value could be inserted without the ééééé ordering ", + "algorithms execution is parallelized as determined by an execution policy; this is a 12345" + "continuation of previous row to make sure string boundaries are honored 012345678901234567" + // v--- this one also + "01234567890é34567890012345678901234567890" + // clang-format on + }); + + auto results = cudf::strings::slice_strings(cudf::strings_column_view(input), 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); +} + TEST_F(StringsSliceTest, Error) { cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h new file mode 100644 index 00000000000..0d630f6bb52 --- /dev/null +++ b/cpp/tests/strings/special_chars.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +namespace cudf::test { + +// special new-line characters for use with regex_flags::EXT_NEWLINE +#define NEXT_LINE "\xC2\x85" +#define LINE_SEPARATOR "\xE2\x80\xA8" +#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9" + +} // namespace cudf::test diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 7575a3ba846..e23f3f6e7d8 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -139,6 +139,41 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, WordsMinHash) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto validity = cudf::test::iterators::null_at(1); + + LCWS input( + {LCWS({"hello", "abcdéfgh"}), + LCWS{}, + LCWS({"rapids", "moré", "test", "text"}), + LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, + validity); + + auto view = cudf::lists_column_view(input); + + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); + using LCW32 = cudf::test::lists_column_wrapper; + LCW32 expected({LCW32{2069617641u, 1975382903u}, + LCW32{}, + LCW32{657297235u, 1010955999u}, + LCW32{644643885u, 310002789u}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); + auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); + using LCW64 = cudf::test::lists_column_wrapper; + LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, + LCW64{}, + LCW64{5331949571924938590ul, 2088583894581919741ul}, + LCW64{3400468157617183341ul, 2398577492366130055ul}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); diff --git a/dependencies.yaml b/dependencies.yaml index 483335c02ff..7a13043cc5f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -710,7 +710,16 @@ dependencies: - numpy==1.23.* - pandas==2.0.* - pyarrow==14.0.0 - - cupy==12.0.0 # ignored as pip constraint + - matrix: + packages: + - output_types: conda + matrices: + - matrix: {dependencies: "oldest", arch: "aarch64", cuda: "12.*"} + packages: + - cupy==12.2.0 # cupy 12.2.0 is the earliest with CUDA 12 ARM packages. + - matrix: {dependencies: "oldest"} + packages: + - cupy==12.0.0 - matrix: packages: - output_types: requirements diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 0398a8d7086..41838e01dd9 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -120,3 +120,23 @@ To profile a script being run from the command line, pass the ```bash python -m cudf.pandas --profile script.py ``` + +### cudf.pandas CLI Features + +Several of the ways to provide input to the `python` interpreter also work with `python -m cudf.pandas`, such as the REPL, the `-c` flag, and reading from stdin. + +Executing `python -m cudf.pandas` with no script name will enter a REPL (read-eval-print loop) similar to the behavior of the normal `python` interpreter. 
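Whichever of these entry points is used, the code executes under the same pandas proxy as a script launched with `python -m cudf.pandas script.py`. As a minimal sketch (the file name and data here are illustrative only, not part of the documented interface):

```python
# check_proxy.py -- run with: python -m cudf.pandas check_proxy.py
import pandas as pd

# Under cudf.pandas, the imported module is a proxy that executes
# supported operations on the GPU via cuDF and falls back to CPU
# pandas for everything else.
print(pd)

df = pd.DataFrame({"key": [1, 1, 2], "val": [10, 20, 30]})
print(df.groupby("key")["val"].sum())  # dispatched to cuDF when supported
```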
+ +The `-c` flag accepts a code string to run, like this: + +```bash +$ python -m cudf.pandas -c "import pandas; print(pandas)" + +``` + +Users can also provide code to execute from stdin, like this: + +```bash +$ echo "import pandas; print(pandas)" | python -m cudf.pandas + +``` diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 2eaa75b3189..95f5f9734dd 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -5,9 +5,9 @@ "id": "4c6c548b", "metadata": {}, "source": [ - "# 10 Minutes to cuDF and Dask-cuDF\n", + "# 10 Minutes to cuDF and Dask cuDF\n", "\n", - "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.\n", + "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask cuDF, geared mainly towards new users.\n", "\n", "## What are these Libraries?\n", "\n", @@ -18,13 +18,14 @@ "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n", "\n", "\n", - "> [!NOTE] \n", - "> This notebook uses the explicit Dask cuDF API (`dask_cudf`) for clarity. However, we strongly recommend that you use Dask's [configuration infrastructure](https://docs.dask.org/en/latest/configuration.html) to set the `\"dataframe.backend\"` to `\"cudf\"`, and work with the `dask.dataframe` API directly. Please see the [Dask cuDF documentation](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) for more information.\n", + "
\n", + "Note: This notebook uses the explicit Dask cuDF API (dask_cudf) for clarity. However, we strongly recommend that you use Dask's configuration infrastructure to set the \"dataframe.backend\" option to \"cudf\", and work with the Dask DataFrame API directly. Please see the Dask cuDF documentation for more information.\n", + "
\n", "\n", "\n", - "## When to use cuDF and Dask-cuDF\n", + "## When to use cuDF and Dask cuDF\n", "\n", - "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF." + "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask cuDF." ] }, { @@ -115,7 +116,7 @@ "source": [ "ds = dask_cudf.from_cudf(s, npartitions=2)\n", "# Note the call to head here to show the first few entries, unlike\n", - "# cuDF objects, dask-cuDF objects do not have a printing\n", + "# cuDF objects, Dask-cuDF objects do not have a printing\n", "# representation that shows values since they may not be in local\n", "# memory.\n", "ds.head(n=3)" @@ -331,11 +332,11 @@ "id": "b17db919", "metadata": {}, "source": [ - "Now we will convert our cuDF dataframe into a dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", + "Now we will convert our cuDF dataframe into a Dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", "\n", - "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", + "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the Dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. 
Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a Dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", "\n", - "*To understand more of the differences between how cuDF and dask-cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" + "*To understand more of the differences between how cuDF and Dask cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" ] }, { @@ -1680,7 +1681,7 @@ "id": "7aa0089f", "metadata": {}, "source": [ - "Note here we call `compute()` rather than `head()` on the dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." + "Note here we call `compute()` rather than `head()` on the Dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." ] }, { @@ -2393,7 +2394,7 @@ "id": "f6094cbe", "metadata": {}, "source": [ - "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." + "Applying functions to a `Series`. Note that applying user defined functions directly with Dask cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." ] }, { @@ -3492,7 +3493,7 @@ "id": "5ac3b004", "metadata": {}, "source": [ - "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF." + "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask cuDF." ] }, { @@ -4181,7 +4182,7 @@ "id": "aa8a445b", "metadata": {}, "source": [ - "To convert the first few entries to pandas, we similarly call `.head()` on the dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." + "To convert the first few entries to pandas, we similarly call `.head()` on the Dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." ] }, { @@ -4899,7 +4900,7 @@ "id": "787eae14", "metadata": {}, "source": [ - "Note that for the dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." + "Note that for the Dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." ] }, { diff --git a/docs/cudf/source/user_guide/io/io.md b/docs/cudf/source/user_guide/io/io.md index adcdaa51e7e..97b961b455b 100644 --- a/docs/cudf/source/user_guide/io/io.md +++ b/docs/cudf/source/user_guide/io/io.md @@ -75,7 +75,6 @@ IO format. - **Notes:** - \[¹\] - Not all orientations are GPU-accelerated. 
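The next hunk extends the nvCOMP compression-support table documented in this section of `io.md`. As a hedged sketch of how a user would opt in to the codecs marked Experimental, assuming the `LIBCUDF_NVCOMP_POLICY` environment variable described alongside the table (where leaving it unset behaves like the "STABLE" option):

```python
import os

# Assumption: LIBCUDF_NVCOMP_POLICY is the environment variable this
# section documents; "ALWAYS" opts in to Experimental codecs, "OFF"
# disables nvCOMP, and unset behaves like "STABLE".
os.environ["LIBCUDF_NVCOMP_POLICY"] = "ALWAYS"

import cudf

df = cudf.DataFrame({"a": range(1000)})
# The codec is still chosen per call; the policy only controls whether
# nvCOMP may be used to implement it.
df.to_parquet("data.parquet", compression="snappy")
```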
@@ -177,4 +176,9 @@ If no value is set, behavior will be the same as the "STABLE" option. +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ | DEFLATE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | Experimental | Experimental | ❌ | +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ + | LZ4 | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ + | GZIP | ❌ | ❌ | Experimental | Experimental | ❌ | ❌ | ❌ | ❌ | ❌ | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ + ``` diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 9a216690384..7fe6cbd45fa 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -3,39 +3,42 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to dask-cudf's documentation! +Welcome to Dask cuDF's documentation! ===================================== -**Dask-cuDF** (pronounced "DASK KOO-dee-eff") is an extension +**Dask cuDF** (pronounced "DASK KOO-dee-eff") is an extension library for the `Dask `__ parallel computing -framework that provides a `cuDF -`__-backed distributed -dataframe with the same API as `Dask dataframes -`__. +framework. When installed, Dask cuDF is automatically registered +as the ``"cudf"`` dataframe backend for +`Dask DataFrame `__. + +.. note:: + Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU + or multi-node execution on their own. You must also deploy a + `dask.distributed ` cluster + to leverage multiple GPUs. We strongly recommend using `Dask-CUDA + `__ to simplify the + setup of the cluster, taking advantage of all features of the GPU + and networking hardware. If you are familiar with Dask and `pandas `__ or -`cuDF `__, then Dask-cuDF +`cuDF `__, then Dask cuDF should feel familiar to you. If not, we recommend starting with `10 minutes to Dask `__ followed -by `10 minutes to cuDF and Dask-cuDF +by `10 minutes to cuDF and Dask cuDF `__. -When running on multi-GPU systems, `Dask-CUDA -`__ is recommended to -simplify the setup of the cluster, taking advantage of all features of -the GPU and networking hardware. -Using Dask-cuDF +Using Dask cuDF --------------- -When installed, Dask-cuDF registers itself as a dataframe backend for -Dask. This means that in many cases, using cuDF-backed dataframes requires -only small changes to an existing workflow. The minimal change is to -select cuDF as the dataframe backend in :doc:`Dask's -configuration `. To do so, we must set the option -``dataframe.backend`` to ``cudf``. From Python, this can be achieved -like so:: +The Dask DataFrame API (Recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Simply use the `Dask configuration ` system to +set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, +this can be achieved like so:: import dask @@ -44,52 +47,157 @@ like so:: Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the environment before running your code. 
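A short sketch of that environment-variable route (the one assumption being that the variable must be in place before ``dask.dataframe`` is first imported):

```python
import os

# Set the backend before dask.dataframe is imported so the option is
# picked up when collections are created.
os.environ["DASK_DATAFRAME__BACKEND"] = "cudf"

import dask.dataframe as dd

df = dd.read_parquet("data.parquet")  # cuDF-backed, with no code changes
```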
-Dataframe creation from on-disk formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If your workflow creates Dask dataframes from on-disk formats -(for example using :func:`dask.dataframe.read_parquet`), then setting -the backend may well be enough to migrate your workflow. - -For example, consider reading a dataframe from parquet:: +Once this is done, the public Dask DataFrame API will leverage +``cudf`` automatically when a new DataFrame collection is created +from an on-disk format using any of the following ``dask.dataframe`` +functions: - import dask.dataframe as dd +* :func:`dask.dataframe.read_parquet` +* :func:`dask.dataframe.read_json` +* :func:`dask.dataframe.read_csv` +* :func:`dask.dataframe.read_orc` +* :func:`dask.dataframe.read_hdf` +* :func:`dask.dataframe.from_dict` - # By default, we obtain a pandas-backed dataframe - df = dd.read_parquet("data.parquet", ...) +For example:: + import dask.dataframe as dd -To obtain a cuDF-backed dataframe, we must set the -``dataframe.backend`` configuration option:: + # By default, we obtain a pandas-backed dataframe + df = dd.read_parquet("data.parquet", ...) import dask - import dask.dataframe as dd dask.config.set({"dataframe.backend": "cudf"}) - # This gives us a cuDF-backed dataframe + # This now gives us a cuDF-backed dataframe df = dd.read_parquet("data.parquet", ...) -This code will use cuDF's GPU-accelerated :func:`parquet reader -` to read partitions of the data. +When other functions are used to create a new collection +(e.g. :func:`from_map`, :func:`from_pandas`, :func:`from_delayed`, +and :func:`from_array`), the backend of the new collection will +depend on the inputs to those functions. For example:: + + import pandas as pd + import cudf + + # This gives us a pandas-backed dataframe + dd.from_pandas(pd.DataFrame({"a": range(10)})) + + # This gives us a cuDF-backed dataframe + dd.from_pandas(cudf.DataFrame({"a": range(10)})) + +An existing collection can always be moved to a specific backend +using the :func:`dask.dataframe.DataFrame.to_backend` API:: + + # This ensures that we have a cuDF-backed dataframe + df = df.to_backend("cudf") + + # This ensures that we have a pandas-backed dataframe + df = df.to_backend("pandas") + +The explicit Dask cuDF API +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition to providing the ``"cudf"`` backend for Dask DataFrame, +Dask cuDF also provides an explicit ``dask_cudf`` API:: + + import dask_cudf + + # This always gives us a cuDF-backed dataframe + df = dask_cudf.read_parquet("data.parquet", ...) + +This API is used implicitly by the Dask DataFrame API when the ``"cudf"`` +backend is enabled. Therefore, using it directly will not provide any +performance benefit over the CPU/GPU-portable ``dask.dataframe`` API. +Also, some parts of the explicit API are incompatible with +automatic query planning (see the next section). + +Query Planning +~~~~~~~~~~~~~~ + +Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). +As long as the ``"dataframe.query-planning"`` configuration is set to +``True`` (the default) when ``dask.dataframe`` is first imported, `Dask +Expressions `__ will be used under the hood.
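Since that flag is read when ``dask.dataframe`` is first imported, pinning it explicitly has to happen before that import; a minimal sketch:

```python
import dask

# Assumption: this runs before the first `import dask.dataframe`;
# afterwards the setting is baked into how collections are built.
dask.config.set({"dataframe.query-planning": True})

import dask.dataframe as dd  # expression-based query planning is now active
```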
+ +For example, the following code will automatically benefit from predicate +pushdown when the result is computed:: + + df = dd.read_parquet("/my/parquet/dataset/") + result = df.sort_values('B')['A'] + +Unoptimized expression graph (``df.pprint()``):: + + Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' ... + +Simplified expression graph (``df.simplify().pprint()``):: + + Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... + +.. note:: + Dask will automatically simplify the expression graph (within + :func:`optimize`) when the result is converted to a task graph + (via :func:`compute` or :func:`persist`). You do not need to call + :func:`simplify` yourself. + + +Using Multiple GPUs and Multiple Nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try +to partition your data into small-enough tasks to fit comfortably in the +memory of a single GPU. This means the necessary compute tasks needed to +compute a query can often be streamed to a single GPU process for +out-of-core computing. This also means that the compute tasks can be +executed in parallel over a multi-GPU cluster. + +In order to execute your Dask workflow on multiple GPUs, you will +typically need to use `Dask-CUDA `__ +to deploy a distributed Dask cluster, and +`Distributed `__ +to define a client object. For example:: + + from dask_cuda import LocalCUDACluster + from distributed import Client + + if __name__ == "__main__": + + client = Client( + LocalCUDACluster( + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling + ) + ) + + df = dd.read_parquet("/my/parquet/dataset/") + agg = df.groupby('B').sum() + agg.compute() # This will use the cluster defined above + +.. note:: + This example uses :func:`compute` to materialize a concrete + ``cudf.DataFrame`` object in local memory. Never call :func:`compute` + on a large collection that cannot fit comfortably in the memory of a + single GPU! See Dask's `documentation on managing computation + `__ + for more details. -Dataframe creation from in-memory formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Please see the `Dask-CUDA `__ +documentation for more information about deploying GPU-aware clusters +(including `best practices +`__). -If you already have a dataframe in memory and want to convert it to a -cuDF-backend one, there are two options depending on whether the -dataframe is already a Dask one or not. If you have a Dask dataframe, -then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"`` -as the backend; if you have a pandas dataframe then you can either -call :func:`dask.dataframe.from_pandas` followed by -:func:`~dask.dataframe.to_backend` or first convert the dataframe with -:func:`cudf.from_pandas` and then parallelise this with -:func:`dask_cudf.from_cudf`. API Reference ------------- -Generally speaking, Dask-cuDF tries to offer exactly the same API as -Dask itself. There are, however, some minor differences mostly because +Generally speaking, Dask cuDF tries to offer exactly the same API as +Dask DataFrame.
There are, however, some minor differences mostly because cuDF does not :doc:`perfectly mirror ` the pandas API, or because cuDF provides additional configuration flags (these mostly occur in data reading and writing interfaces). @@ -97,7 +205,7 @@ flags (these mostly occur in data reading and writing interfaces). As a result, straightforward workflows can be migrated without too much trouble, but more complex ones that utilise more features may need a bit of tweaking. The API documentation describes details of the -differences and all functionality that Dask-cuDF supports. +differences and all functionality that Dask cuDF supports. .. toctree:: :maxdepth: 2 diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 5a0fbd224ad..6a0f0f6f169 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -218,7 +218,13 @@ static long initViewHandle(DType type, int numRows, int nullCount, od, vd, nullCount, numRows, childHandles); } - static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { + /** + * Creates a ColumnVector from a native column_view using a contiguous device allocation. + * + * @param columnViewAddress address of the native column_view + * @param buffer device buffer containing the data referenced by the column view + */ + public static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { return new ColumnVector(columnViewAddress, buffer); } diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index b37d0d88ec9..c8308ca17ec 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -34,6 +34,10 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; + private final boolean strictValidation; + private final boolean allowLeadingZeros; + private final boolean allowNonNumericNumbers; + private final boolean allowUnquotedControlChars; private JSONOptions(Builder builder) { super(builder); @@ -44,6 +48,10 @@ private JSONOptions(Builder builder) { normalizeWhitespace = builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; + strictValidation = builder.strictValidation; + allowLeadingZeros = builder.allowLeadingZeros; + allowNonNumericNumbers = builder.allowNonNumericNumbers; + allowUnquotedControlChars = builder.allowUnquotedControlChars; } public boolean isDayFirst() { @@ -75,6 +83,22 @@ public boolean keepStringQuotes() { return keepStringQuotes; } + public boolean strictValidation() { + return strictValidation; + } + + public boolean leadingZerosAllowed() { + return allowLeadingZeros; + } + + public boolean nonNumericNumbersAllowed() { + return allowNonNumericNumbers; + } + + public boolean unquotedControlChars() { + return allowUnquotedControlChars; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -85,6 +109,10 @@ public static Builder builder() { } public static final class Builder extends ColumnFilterOptions.Builder { + private boolean strictValidation = false; + private boolean allowUnquotedControlChars = true; + private boolean 
allowNonNumericNumbers = false; + private boolean allowLeadingZeros = false; private boolean dayFirst = false; private boolean lines = true; @@ -95,10 +123,45 @@ public static final class Builder extends ColumnFilterOptions.Builder { + private int rowGroupSizeRows = 1000000; //Max of 1 million rows per row group + private long rowGroupSizeBytes = 128 * 1024 * 1024; //Max of 128MB per row group private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP; public Builder() { super(); } + public Builder withRowGroupSizeRows(int rowGroupSizeRows) { + this.rowGroupSizeRows = rowGroupSizeRows; + return this; + } + + public Builder withRowGroupSizeBytes(long rowGroupSizeBytes) { + this.rowGroupSizeBytes = rowGroupSizeBytes; + return this; + } + public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) { this.statsGranularity = statsGranularity; return this; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 36e342cae13..09da43374ae 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -254,7 +254,11 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean normalizeSingleQuotes, boolean normalizeWhitespace, boolean mixedTypesAsStrings, - boolean keepStringQuotes) throws CudfException; + boolean keepStringQuotes, + boolean strictValidation, + boolean allowLeadingZeros, + boolean allowNonNumericNumbers, + boolean allowUnquotedControl) throws CudfException; private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames, int[] dTypeIds, int[] dTypeScales, @@ -264,6 +268,10 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean normalizeWhitespace, boolean mixedTypesAsStrings, boolean keepStringQuotes, + boolean strictValidation, + boolean allowLeadingZeros, + boolean allowNonNumericNumbers, + boolean allowUnquotedControl, long dsHandle) throws CudfException; private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines, @@ -272,7 +280,12 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool boolean normalizeWhitespace, boolean mixedTypesAsStrings, boolean keepStringQuotes, + boolean strictValidation, + boolean allowLeadingZeros, + boolean allowNonNumericNumbers, + boolean allowUnquotedControl, long dsHandle) throws CudfException; + private static native long readAndInferJSON(long address, long length, boolean dayFirst, boolean lines, @@ -280,7 +293,11 @@ private static native long readAndInferJSON(long address, long length, boolean normalizeSingleQuotes, boolean normalizeWhitespace, boolean mixedTypesAsStrings, - boolean keepStringQuotes) throws CudfException; + boolean keepStringQuotes, + boolean strictValidation, + boolean allowLeadingZeros, + boolean allowNonNumericNumbers, + boolean allowUnquotedControl) throws CudfException; /** * Read in Parquet formatted data. @@ -315,20 +332,22 @@ private static native long[] readAvroFromDataSource(String[] filterColumnNames, /** * Setup everything to write parquet formatted data to a file. 
- * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param filename local output path + * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param filename local output path * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetFileBegin(String[] columnNames, @@ -338,6 +357,8 @@ private static native long writeParquetFileBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -349,20 +370,22 @@ private static native long writeParquetFileBegin(String[] columnNames, /** * Setup everything to write parquet formatted data to a buffer. - * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param consumer consumer of host buffers produced. 
+ * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetBufferBegin(String[] columnNames, @@ -372,6 +395,8 @@ private static native long writeParquetBufferBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -1292,7 +1317,11 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), opts.isMixedTypesAsStrings(), - opts.keepStringQuotes()))) { + opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars()))) { return gatherJSONColumns(schema, twm, -1); } @@ -1370,7 +1399,12 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), - opts.isMixedTypesAsStrings(), opts.keepStringQuotes())); + opts.isMixedTypesAsStrings(), + opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars())); } /** @@ -1388,6 +1422,10 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { opts.isNormalizeWhitespace(), opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars(), dsHandle)); return twm; } finally { @@ -1430,10 +1468,18 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, - buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), + buffer.getAddress() + offset, len, + opts.isDayFirst(), + opts.isLines(), + opts.isRecoverWithNull(), + opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), - opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) { + opts.isMixedTypesAsStrings(), + opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars()))) { return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1454,17 +1500,26 @@ public static Table readJSON(Schema schema, 
JSONOptions opts, DataSource ds) { * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param ds the DataSource to read from. - * @param emtpyRowCount the number of rows to return if no columns were read. + * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ - public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emtpyRowCount) { + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), - schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), - opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), + schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), + opts.isDayFirst(), + opts.isLines(), + opts.isRecoverWithNull(), + opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), - opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) { - return gatherJSONColumns(schema, twm, emtpyRowCount); + opts.isMixedTypesAsStrings(), + opts.keepStringQuotes(), + opts.strictValidation(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars(), + dsHandle))) { + return gatherJSONColumns(schema, twm, emptyRowCount); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } @@ -1773,6 +1828,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), @@ -1793,6 +1850,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c5abf08a59d..92e213bcb60 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1623,6 +1623,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1642,8 +1646,13 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .keep_quotes(keep_quotes); - + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } auto result = 
std::make_unique(cudf::io::read_json(opts.build())); @@ -1652,17 +1661,22 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, - jclass, - jlong buffer, - jlong buffer_length, - jboolean day_first, - jboolean lines, - jboolean recover_with_null, - jboolean normalize_single_quotes, - jboolean normalize_whitespace, - jboolean mixed_types_as_string, - jboolean keep_quotes) +JNIEXPORT jlong JNICALL +Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, + jclass, + jlong buffer, + jlong buffer_length, + jboolean day_first, + jboolean lines, + jboolean recover_with_null, + jboolean normalize_single_quotes, + jboolean normalize_whitespace, + jboolean mixed_types_as_string, + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1684,8 +1698,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) + .strict_validation(strict_validation) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1790,6 +1810,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1824,7 +1848,13 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .keep_quotes(keep_quotes); + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { @@ -1874,7 +1904,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, - jboolean keep_quotes) + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { bool read_buffer = true; if (buffer == 0) { @@ -1923,7 +1957,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .keep_quotes(keep_quotes); + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } if 
(!n_types.is_null()) { if (n_types.size() != n_scales.size()) { @@ -2110,6 +2150,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2165,6 +2207,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) @@ -2187,6 +2231,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2240,6 +2286,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 050bcbb268f..830f2b33b32 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -437,6 +437,7 @@ void testReadWhitespacesJSONFile() throws IOException { } } + @Test void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "A") @@ -455,6 +456,206 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { } } + private static final byte[] JSON_VALIDATION_BUFFER = ( + "{\"a\":true}\n" + + "{\"a\":false}\n" + + "{\"a\":null}\n" + + "{\"a\":true, \"b\":truee}\n" + + "{\"a\":true, \"b\":\"nulll\"}\n" + + "{\"a\": 1}\n" + + "{\"a\": 0}\n" + + "{\"a\": -}\n" + + "{\"a\": -0}\n" + + "{\"a\": -01}\n" + + + "{\"a\": 01}\n" + + "{\"a\": -0.1}\n" + + "{\"a\": -00.1}\n" + + "{\"a\": NaN}\n" + + "{\"a\": INF}\n" + + "{\"a\": +INF}\n" + + "{\"a\": -INF}\n" + + "{\"a\": +Infinity}\n" + + "{\"a\": Infinity}\n" + + "{\"a\": -Infinity}\n" + + + "{\"a\": INFinity}\n" + + "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" + + "{\"a\":12.}\n" + + "{\"a\": -3.4e+38}\n" + + "{\"a\": -3.4e-38}\n" + + "{\"a\": 1.4e38}\n" + + "{\"a\": -3.4E+38}\n" + + "{\"a\": -3.4E-38}\n" + + "{\"a\": 1.4E38}\n" + + "{\"a\": -3.4E+}\n" + + + "{\"a\": -3.4E-}\n" + + "{\"a\": \"A\u0000B\"}\n" + + "{\"a\": \"A\\u0000B\"}\n" + + "{\"a\": \"A\u0001B\"}\n" + + "{\"a\": \"A\\u0001B\"}\n" + + "{\"a\": \"A\u001FB\"}\n" + + "{\"a\": \"A\\u001FB\"}\n" + + "{\"a\": \"A\u0020B\"}\n" + + "{\"a\": \"A\\u0020B\"}\n" + + "{\"a\": \"\\u12\"}\n" + + + "{\"a\": \"\\z\"}\n" + + "{\"a\": \"\\r\"}\n" + + "{\"a\": \"something\", \"b\": \"\\z\"}\n" + ).getBytes(StandardCharsets.UTF_8); + + @Test + void testJSONValidationNoStrict() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + 
.withNormalizeSingleQuotes(true) + .withStrictValidation(false) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", + "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", + "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", "\"something\"") + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidation() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationLeadingZeros() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(true) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", "-01", + "01", "-0.1", "-00.1", null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationNonNumeric() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + 
.withLeadingZeros(false) + .withNonNumericNumbers(true) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationUnquotedControl() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(false) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + @@ -8921,7 +9122,11 @@ void testParquetWriteToBufferChunked() { columns.add(Columns.STRUCT.name); WriteUtils.buildWriterOptions(optBuilder, columns); ParquetWriterOptions options = optBuilder.build(); - ParquetWriterOptions optionsNoCompress = optBuilder.withCompressionType(CompressionType.NONE).build(); + ParquetWriterOptions optionsNoCompress = + optBuilder.withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) + .build(); try (Table table0 = getExpectedFileTable(columns); MyBufferConsumer consumer = new MyBufferConsumer()) { try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { @@ -9007,6 +9212,8 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { .withDecimalColumn("_c7", 4) .withDecimalColumn("_c8", 6) .withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE) .build(); try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index d7da42a1708..99b759e2166 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -46,7 +46,7 @@ ListDtype, StructDtype, ) -from cudf.core.groupby import Grouper +from cudf.core.groupby import Grouper, NamedAgg from cudf.core.index import ( BaseIndex, CategoricalIndex, diff --git 
a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5ee15d0e409..59cb8d51440 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, ) from pylibcudf.libcudf.types cimport size_type @@ -54,3 +56,39 @@ def minhash64(Column strings, Column seeds, int width): ) return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash64(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash64( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 47a194c4fda..4bf8a9b1a8f 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, @@ -6,7 +6,12 @@ hash_character_ngrams, ) from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import minhash, minhash64 +from cudf._lib.nvtext.minhash import ( + minhash, + minhash64, + word_minhash, + word_minhash64, +) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 16e6908f308..e059917b0b8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5349,6 +5349,76 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint32. 
+ + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + >>> ls.str.word_minhash(seeds=seeds) + 0 [21141582, 1232889953, 1268336794] + 1 [962346254, 2321233602, 1354839212] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash(self._column, seeds_column) + ) + + def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + This function generates 2 uint64 values but only the first + uint64 value is used. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint64. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + >>> ls.str.word_minhash64(seeds) + 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] + 1 [5240044617220523711, 5847101123925041457, 153762819128779913] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash64(self._column, seeds_column) + ) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given diff --git a/python/cudf/cudf/core/groupby/__init__.py b/python/cudf/cudf/core/groupby/__init__.py index 4375ed3e3da..621edb316cf 100644 --- a/python/cudf/cudf/core/groupby/__init__.py +++ b/python/cudf/cudf/core/groupby/__init__.py @@ -1,8 +1,9 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf.core.groupby.groupby import GroupBy, Grouper +from cudf.core.groupby.groupby import GroupBy, Grouper, NamedAgg __all__ = [ "GroupBy", "Grouper", + "NamedAgg", ] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4f283d41b17..6424c8af877 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -76,6 +76,34 @@ def _is_row_of(chunk, obj): ) +NamedAgg = pd.NamedAgg + + +NamedAgg.__doc__ = """ +Helper for column specific aggregation with control over output column names. + +Subclass of typing.NamedTuple. + +Parameters +---------- +column : Hashable + Column label in the DataFrame to apply aggfunc. +aggfunc : function or str + Function to apply to the provided column. + +Examples +-------- +>>> df = cudf.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) +>>> agg_a = cudf.NamedAgg(column="a", aggfunc="min") +>>> agg_1 = cudf.NamedAgg(column=1, aggfunc=lambda x: x.mean()) +>>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) + result_a result_1 +key +1 -1 10.5 +2 1 12.0 +""" + + groupby_doc_template = textwrap.dedent( """Group using a mapper or by a Series of columns. 
@@ -1296,9 +1324,21 @@ def _normalize_aggs( columns = values._columns aggs_per_column = (aggs,) * len(columns) elif not aggs and kwargs: - column_names, aggs_per_column = kwargs.keys(), kwargs.values() - columns = tuple(self.obj._data[x[0]] for x in kwargs.values()) - aggs_per_column = tuple(x[1] for x in kwargs.values()) + column_names = kwargs.keys() + + def _raise_invalid_type(x): + raise TypeError( + f"Invalid keyword argument {x} of type {type(x)} was passed to agg" + ) + + columns, aggs_per_column = zip( + *( + (self.obj._data[x[0]], x[1]) + if isinstance(x, tuple) + else _raise_invalid_type(x) + for x in kwargs.values() + ) + ) else: raise TypeError("Must provide at least one aggregation function.") diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3d205957126..c026579b8b5 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -738,7 +738,8 @@ def get_dummies( sparse : boolean, optional Right now this is NON-FUNCTIONAL argument in rapids. drop_first : boolean, optional - Right now this is NON-FUNCTIONAL argument in rapids. + Whether to get k-1 dummies out of k categorical levels by removing the + first level. columns : sequence of str, optional Names of columns to encode. If not provided, will attempt to encode all columns. Note this is different from pandas default behavior, which @@ -806,9 +807,6 @@ def get_dummies( if sparse: raise NotImplementedError("sparse is not supported yet") - if drop_first: - raise NotImplementedError("drop_first is not supported yet") - if isinstance(data, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] @@ -862,6 +860,7 @@ def get_dummies( prefix=prefix_map.get(name, prefix), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, + drop_first=drop_first, ) result_data.update(col_enc_data) return cudf.DataFrame._from_data(result_data, index=data.index) @@ -874,6 +873,7 @@ def get_dummies( prefix=prefix, prefix_sep=prefix_sep, dtype=dtype, + drop_first=drop_first, ) return cudf.DataFrame._from_data(data, index=ser.index) @@ -1256,6 +1256,7 @@ def _one_hot_encode_column( prefix: str | None, prefix_sep: str | None, dtype: Dtype | None, + drop_first: bool, ) -> dict[str, ColumnBase]: """Encode a single column with one hot encoding. The return dictionary contains pairs of (category, encodings). 
The keys may be prefixed with @@ -1276,6 +1277,8 @@ def _one_hot_encode_column( ) data = one_hot_encode(column, categories) + if drop_first and len(data): + data.pop(next(iter(data))) if prefix is not None and prefix_sep is not None: data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()} if dtype: diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 3a82829eb7a..e0d3d9101a9 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -10,6 +10,7 @@ """ import argparse +import code import runpy import sys import tempfile @@ -21,6 +22,8 @@ @contextmanager def profile(function_profile, line_profile, fn): + if fn is None and (line_profile or function_profile): + raise RuntimeError("Enabling the profiler requires a script name.") if line_profile: with open(fn) as f: lines = f.readlines() @@ -54,6 +57,11 @@ def main(): dest="module", nargs=1, ) + parser.add_argument( + "-c", + dest="cmd", + nargs=1, + ) parser.add_argument( "--profile", action="store_true", @@ -72,9 +80,18 @@ def main(): args = parser.parse_args() + if args.cmd: + f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py") + f.write(args.cmd[0].encode()) + f.seek(0) + args.args.insert(0, f.name) + install() - with profile(args.profile, args.line_profile, args.args[0]) as fn: - args.args[0] = fn + + script_name = args.args[0] if len(args.args) > 0 else None + with profile(args.profile, args.line_profile, script_name) as fn: + if script_name is not None: + args.args[0] = fn if args.module: (module,) = args.module # run the module passing the remaining arguments @@ -85,6 +102,21 @@ def main(): # Remove ourself from argv and continue sys.argv[:] = args.args runpy.run_path(args.args[0], run_name="__main__") + else: + if sys.stdin.isatty(): + banner = f"Python {sys.version} on {sys.platform}" + site_import = not sys.flags.no_site + if site_import: + cprt = 'Type "help", "copyright", "credits" or "license" for more information.' 
+ banner += "\n" + cprt + else: + # Don't show prompts or banners if stdin is not a TTY + sys.ps1 = "" + sys.ps2 = "" + banner = "" + + # Launch an interactive interpreter + code.interact(banner=banner, exitmsg="") if __name__ == "__main__": diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index 99e7523031b..dc20a27177a 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -56,3 +56,19 @@ def test_dataframe_agg(attr, func): ) assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)( + foo=cudf.NamedAgg(column="b", aggfunc=func), + bar=cudf.NamedAgg(column="a", aggfunc=func), + ) + pd_agg = getattr(pdf.groupby(["a"]), attr)( + foo=("b", func), bar=("a", func) + ) + + assert_eq(agg, pd_agg) + + +def test_dataframe_agg_with_invalid_kwarg(): + with pytest.raises(TypeError, match="Invalid keyword argument"): + df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) + df.groupby("a").agg(foo=set()) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index cc17dc46e0a..e054143b438 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -161,3 +161,20 @@ def test_get_dummies_cats_deprecated(): df = cudf.DataFrame(range(3)) with pytest.warns(FutureWarning): cudf.get_dummies(df, cats={0: [0, 1, 2]}) + + +def test_get_dummies_drop_first_series(): + result = cudf.get_dummies(cudf.Series(list("abcaa")), drop_first=True) + expected = pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) + assert_eq(result, expected) + + +def test_get_dummies_drop_first_dataframe(): + result = cudf.get_dummies( + cudf.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), + drop_first=True, + ) + expected = pd.get_dummies( + pd.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), drop_first=True + ) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 8b59a7eef08..7f1b0b1cd46 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3822,8 +3822,8 @@ def test_parquet_reader_with_mismatched_tables(store_schema): df1 = cudf.DataFrame( { "i32": cudf.Series([None, None, None], dtype="int32"), - "i64": cudf.Series([1234, None, 123], dtype="int64"), - "list": list([[1, 2], [None, 4], [5, 6]]), + "i64": cudf.Series([1234, 467, 123], dtype="int64"), + "list": list([[1, 2], None, [None, 6]]), "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), "str": ["vfd", None, "ghu"], "d_list": list( @@ -3838,14 +3838,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema): df2 = cudf.DataFrame( { - "str": ["abc", "def", None], + "str": ["abc", "def", "ghi"], "i64": cudf.Series([None, 65, 98], dtype="int64"), "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"), - "list": list([[7, 8], [9, 10], [None, 12]]), + "list": list([[7, 8], [9, 10], [11, 12]]), "d_list": list( [ [pd.Timedelta(minutes=4), None], - [None, None], + None, [pd.Timedelta(minutes=6), None], ] ), @@ -3900,38 +3900,27 @@ def test_parquet_reader_with_mismatched_structs(): { "a": 1, "b": { - "inner_a": 10, - "inner_b": {"inner_inner_b": 1, "inner_inner_a": 2}, + "a_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 2}, }, "c": 2, }, { "a": 3, - "b": {"inner_a": 30, "inner_b": {"inner_inner_a": 210}}, + "b": {"b_a": 30, "b_b": {"b_b_a": 210}}, "c": 4, }, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 5, "b": {"b_a": 50, 
"b_b": None}, "c": 6}, {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - { - "a": None, - "b": { - "inner_a": None, - "inner_b": {"inner_inner_b": None, "inner_inner_a": 10}, - }, - "c": 10, - }, + {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None}, ] data2 = [ - {"a": 1, "b": {"inner_b": {"inner_inner_a": None}}}, - {"a": 3, "b": {"inner_b": {"inner_inner_a": 1}}}, - {"a": 5, "b": {"inner_b": None}}, - {"a": 7, "b": {"inner_b": {"inner_inner_b": 1, "inner_inner_a": 0}}}, - {"a": None, "b": {"inner_b": None}}, + {"a": 1, "b": {"b_b": {"b_b_a": None}}}, + {"a": 5, "b": {"b_b": None}}, + {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}}, + {"a": None, "b": {"b_b": None}}, None, - {"a": None, "b": {"inner_b": {"inner_inner_a": 1}}}, ] # cuDF tables from struct data @@ -3949,20 +3938,20 @@ def test_parquet_reader_with_mismatched_structs(): # Read the struct.b.inner_b.inner_inner_a column from parquet got = cudf.read_parquet( [buf1, buf2], - columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], allow_mismatched_pq_schemas=True, ) got = ( cudf.Series(got["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Read with chunked reader got_chunked = read_parquet_chunked( [buf1, buf2], - columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], chunk_read_limit=240, pass_read_limit=240, allow_mismatched_pq_schemas=True, @@ -3970,8 +3959,8 @@ def test_parquet_reader_with_mismatched_structs(): got_chunked = ( cudf.Series(got_chunked["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Construct the expected series @@ -3979,12 +3968,12 @@ def test_parquet_reader_with_mismatched_structs(): [ cudf.Series(df1["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + .struct.field("b_b_a"), cudf.Series(df2["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + .struct.field("b_b_a"), ] ).reset_index(drop=True) @@ -4023,12 +4012,12 @@ def test_parquet_reader_with_mismatched_schemas_error(): ) data1 = [ - {"a": 1, "b": {"inner_a": 1, "inner_b": 6}}, - {"a": 3, "b": {"inner_a": None, "inner_b": 2}}, + {"a": 1, "b": {"b_a": 1, "b_b": 6}}, + {"a": 3, "b": {"b_a": None, "b_b": 2}}, ] data2 = [ - {"b": {"inner_a": 1}, "c": "str"}, - {"b": {"inner_a": None}, "c": None}, + {"b": {"b_a": 1}, "c": "str"}, + {"b": {"b_a": None}, "c": None}, ] # cuDF tables from struct data @@ -4059,6 +4048,191 @@ def test_parquet_reader_with_mismatched_schemas_error(): ): cudf.read_parquet( [buf1, buf2], - columns=["struct.b.inner_b"], + columns=["struct.b.b_b"], allow_mismatched_pq_schemas=True, ) + + +def test_parquet_reader_mismatched_nullability(): + # Ensure that we can faithfully read the tables with mismatched nullabilities + df1 = cudf.DataFrame( + { + "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + None, + [pd.Timedelta(minutes=8), None], + ], + None, + ], + None, + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, None, 4123], dtype="int64"), + "int32": 
cudf.Series([1234, 123, 4123], dtype="int32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "string": cudf.Series(["kitten", "puppy", "cub"]), + } + ) + + df2 = cudf.DataFrame( + { + "timedelta": cudf.Series( + [None, None, None], dtype="timedelta64[ms]" + ), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)], + ], + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, 123, 4123], dtype="int64"), + "int32": cudf.Series([1234, None, 4123], dtype="int32"), + "list": list([[1, 2], None, [1, 2]]), + "datetime": cudf.Series( + [1234, None, 4123], dtype="datetime64[ms]" + ), + "string": cudf.Series(["kitten", None, "cub"]), + } + ) + + # Write tables to parquet with arrow schema for compatibility with duration column(s) + fname1 = BytesIO() + df1.to_parquet(fname1, store_schema=True) + fname2 = BytesIO() + df2.to_parquet(fname2, store_schema=True) + + # Read tables back with cudf in either order and compare + assert_eq( + cudf.read_parquet([fname1, fname2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([fname2, fname1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) + + +def test_parquet_reader_mismatched_nullability_structs(tmpdir): + data1 = [ + { + "a": "a", + "b": { + "b_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 12}, + }, + "c": [1, 2], + }, + { + "a": "b", + "b": { + "b_a": 30, + "b_b": {"b_b_b": 2, "b_b_a": 2}, + }, + "c": [3, 4], + }, + { + "a": "c", + "b": { + "b_a": 50, + "b_b": {"b_b_b": 4, "b_b_a": 5}, + }, + "c": [5, 6], + }, + { + "a": "d", + "b": { + "b_a": 135, + "b_b": {"b_b_b": 12, "b_b_a": 32}, + }, + "c": [7, 8], + }, + { + "a": "e", + "b": { + "b_a": 1, + "b_b": {"b_b_b": 1, "b_b_a": 5}, + }, + "c": [9, 10], + }, + { + "a": "f", + "b": { + "b_a": 32, + "b_b": {"b_b_b": 1, "b_b_a": 6}, + }, + "c": [11, 12], + }, + ] + + data2 = [ + { + "a": "g", + "b": { + "b_a": 10, + "b_b": {"b_b_b": None, "b_b_a": 2}, + }, + "c": None, + }, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, + {"a": "j", "b": None, "c": [8, 10]}, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, + None, + { + "a": None, + "b": {"b_a": None, "b_b": {"b_b_b": 1}}, + "c": [18, 19], + }, + {"a": None, "b": None, "c": None}, + ] + + pa_table1 = pa.Table.from_pydict({"struct": data1}) + df1 = cudf.DataFrame.from_arrow(pa_table1) + + pa_table2 = pa.Table.from_pydict({"struct": data2}) + df2 = cudf.DataFrame.from_arrow(pa_table2) + + # Write tables to parquet + buf1 = BytesIO() + df1.to_parquet(buf1) + buf2 = BytesIO() + df2.to_parquet(buf2) + + # Read tables back with cudf and compare with expected.
+ assert_eq( + cudf.read_parquet([buf1, buf2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([buf2, buf1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 52179f55da3..997ca357986 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -946,6 +946,66 @@ def test_minhash(): strings.str.minhash64(seeds=seeds) +def test_word_minhash(): + ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + + expected = cudf.Series( + [ + cudf.Series([21141582], dtype=np.uint32), + cudf.Series([962346254], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), + cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash(seeds=seeds) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + cudf.Series([2603139454418834912], dtype=np.uint64), + cudf.Series([5240044617220523711], dtype=np.uint64), + ] + ) + actual = ls.str.word_minhash64() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [ + 2603139454418834912, + 8644371945174847701, + 5541030711534384340, + ], + dtype=np.uint64, + ), + cudf.Series( + [5240044617220523711, 5847101123925041457, 153762819128779913], + dtype=np.uint64, + ), + ] + ) + actual = ls.str.word_minhash64(seeds=seeds) + assert_eq(expected, actual) + + # test wrong seed types + with pytest.raises(ValueError): + ls.str.word_minhash(seeds="a") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + ls.str.word_minhash(seeds=seeds) + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + ls.str.word_minhash64(seeds=seeds) + + def test_jaccard_index(): str1 = cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py new file mode 100644 index 00000000000..326224c8fc0 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_main.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import tempfile +import textwrap + + +def _run_python(*, cudf_pandas, command): + executable = "python " + if cudf_pandas: + executable += "-m cudf.pandas " + return subprocess.run( + executable + command, + shell=True, + capture_output=True, + check=True, + text=True, + ) + + +def test_run_cudf_pandas_with_script(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as f: + code = textwrap.dedent( + """ + import pandas as pd + df = pd.DataFrame({'a': [1, 2, 3]}) + print(df['a'].sum()) + """ + ) + f.write(code) + f.flush() + + res = _run_python(cudf_pandas=True, command=f.name) + expect = _run_python(cudf_pandas=False, command=f.name) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args(): + input_args_and_code = """-c 'import pandas as pd; df = pd.DataFrame({"a": [1, 2, 3]}); print(df["a"].sum())'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf(): + """Verify that cudf is active with -m cudf.pandas.""" + input_args_and_code = """-c 'import pandas as pd; print(pd)'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert "cudf" in res.stdout + assert "cudf" not in expect.stdout + + +def test_cudf_pandas_script_repl(): + def start_repl_process(cmd): + return subprocess.Popen( + cmd.split(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True, + ) + + def get_repl_output(process, commands): + for command in commands: + process.stdin.write(command) + process.stdin.flush() + return process.communicate() + + p1 = start_repl_process("python -m cudf.pandas") + p2 = start_repl_process("python") + commands = [ + "import pandas as pd\n", + "print(pd.Series(range(2)).sum())\n", + "print(pd.Series(range(5)).sum())\n", + "import sys\n", + "print(pd.Series(list('abcd')), file=sys.stderr)\n", + ] + + res = get_repl_output(p1, commands) + expect = get_repl_output(p2, commands) + + # Check stdout + assert res[0] != "" + assert res[0] == expect[0] + + # Check stderr + assert res[1] != "" + assert res[1] == expect[1] + + p1.kill() + p2.kill() diff --git a/python/custreamz/README.md b/python/custreamz/README.md index 1509dac9e61..8da17ef09dc 100644 --- a/python/custreamz/README.md +++ b/python/custreamz/README.md @@ -54,7 +54,7 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids ### Conda -cuStreamz is installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` or `rapidsai-nightly` channel: +cuStreamz can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: Release: ```bash diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 6edb9f87d48..4655d2165f0 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -1,135 +1,63 @@ #
 Dask cuDF - A GPU Backend for Dask DataFrame
-Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html). When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. - -## Using Dask cuDF - -### The Dask DataFrame API (Recommended) - -Simply set the `"dataframe.backend"` [configuration](https://docs.dask.org/en/stable/configuration.html) to `"cudf"` in Dask, and the public Dask DataFrame API will leverage `cudf` automatically: - -```python -import dask -dask.config.set({"dataframe.backend": "cudf"}) - -import dask.dataframe as dd -# This gives us a cuDF-backed dataframe -df = dd.read_parquet("data.parquet", ...) -``` +Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html) that provides a Pandas-like API for parallel and larger-than-memory DataFrame computing on GPUs. When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. > [!IMPORTANT] -> The `"dataframe.backend"` configuration will only be used for collection creation when the following APIs are used: `read_parquet`, `read_json`, `read_csv`, `read_orc`, `read_hdf`, and `from_dict`. For example, if `from_map`, `from_pandas`, `from_delayed`, or `from_array` are used, the backend of the new collection will depend on the input to the function: - -```python -import pandas as pd -import cudf - -# This gives us a Pandas-backed dataframe -dd.from_pandas(pd.DataFrame({"a": range(10)})) - -# This gives us a cuDF-backed dataframe -dd.from_pandas(cudf.DataFrame({"a": range(10)})) -``` - -A cuDF-backed DataFrame collection can be moved to the `"pandas"` backend: - -```python -df = df.to_backend("pandas") -``` - -Similarly, a Pandas-backed DataFrame collection can be moved to the `"cudf"` backend: - -```python -df = df.to_backend("cudf") -``` - -### The Explicit Dask cuDF API - -In addition to providing the `"cudf"` backend for Dask DataFrame, Dask cuDF also provides an explicit `dask_cudf` API: - -```python -import dask_cudf - -# This always gives us a cuDF-backed dataframe -df = dask_cudf.read_parquet("data.parquet", ...) -``` - -> [!NOTE] -> This API is used implicitly by the Dask DataFrame API when the `"cudf"` backend is enabled. Therefore, using it directly will not provide any performance benefit over the CPU/GPU-portable `dask.dataframe` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section). +> Dask cuDF does not provide support for multi-GPU or multi-node execution on its own. You must also deploy a distributed cluster (ideally with [Dask-CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs efficiently. -See the [Dask cuDF's API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) for further information. - -## Query Planning - -Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the `"dataframe.query-planning"` configuration is set to `True` (the default) when `dask.dataframe` is first imported, [Dask Expressions](https://github.com/dask/dask-expr) will be used under the hood. - -For example, the following user code will automatically benefit from predicate pushdown when the result is computed. 
- -```python -df = dd.read_parquet("/my/parquet/dataset/") -result = df.sort_values('B')['A'] -``` - -Unoptimized expression graph (`df.pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' ... -``` +## Using Dask cuDF -Simplified expression graph (`df.simplify().pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... -``` +Please visit [the official documentation page](https://docs.rapids.ai/api/dask-cudf/stable/) for detailed information about using Dask cuDF. -> [!NOTE] -> Dask will automatically simplify the expression graph (within `optimize`) when the result is converted to a task graph (via `compute` or `persist`). The user does not need to call `simplify` themself. +## Installation +See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. -## Using Multiple GPUs and Multiple Nodes +## Resources -Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try to partition your data into small-enough tasks to fit comfortably in the memory of a single GPU. This means the necessary compute tasks needed to compute a query can often be streamed to a single GPU process for out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster. +- [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) +- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) +- [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) +- [Deployment](https://docs.rapids.ai/deployment/stable/) +- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. -> [!IMPORTANT] -> Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must deploy a distributed cluster (ideally with [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs. +### Quick-start example -In order to execute your Dask workflow on multiple GPUs, you will typically need to use [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) to deploy distributed Dask cluster, and [Distributed](https://distributed.dask.org/en/stable/client.html) to define a `client` object. For example: +A very common Dask cuDF use case is single-node multi-GPU data processing. 
These workflows typically use the following pattern: ```python - +import dask +import dask.dataframe as dd from dask_cuda import LocalCUDACluster from distributed import Client -client = Client( +if __name__ == "__main__": + + # Define a GPU-aware cluster to leverage multiple GPUs + client = Client( LocalCUDACluster( - CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) - rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations - enable_cudf_spill=True, # Improve device memory stability - local_directory="/fast/scratch/", # Use fast local storage for spilling + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling ) -) + ) -df = dd.read_parquet("/my/parquet/dataset/") -agg = df.groupby('B').sum() -agg.compute() # This will use the cluster defined above -``` + # Set the default dataframe backend to "cudf" + dask.config.set({"dataframe.backend": "cudf"}) -> [!NOTE] -> This example uses `compute` to materialize a concrete `cudf.DataFrame` object in local memory. Never call `compute` on a large collection that cannot fit comfortably in the memory of a single GPU! See Dask's [documentation on managing computation](https://distributed.dask.org/en/stable/manage-computation.html) for more details. + # Create your DataFrame collection from on-disk + # or in-memory data + df = dd.read_parquet("/my/parquet/dataset/") -Please see the [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) documentation for more information about deploying GPU-aware clusters (including [best practices](https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/)). + # Use cudf-like syntax to transform and/or query your data + query = df.groupby('item')['price'].mean() -## Install - -See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. + # Compute, persist, or write out the result + query.head() +``` -## Resources +If you do not have multiple GPUs available, using `LocalCUDACluster` is optional. However, it is still a good idea to [enable cuDF spilling](https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory). -- [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) -- [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/) -- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) -- [Dask CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) -- [Deployment](https://docs.rapids.ai/deployment/stable/) -- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. +If you wish to scale across multiple nodes, you will need to use a different mechanism to deploy your Dask-CUDA workers. Please see [the RAPIDS deployment documentation](https://docs.rapids.ai/deployment/stable/) for more instructions. 
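For readers who stop at the single-GPU note above, here is a minimal sketch of what "enabling cuDF spilling" looks like in practice, assuming the `cudf.set_option` interface and the `CUDF_SPILL` environment variable described in the linked spilling guide:

```python
import cudf

# Opt in to cuDF's spilling of device buffers to host memory; this is
# equivalent to launching the process with CUDF_SPILL=on in the environment.
cudf.set_option("spill", True)

# Subsequent cuDF (and Dask cuDF) work may now spill under GPU memory pressure.
df = cudf.DataFrame({"a": range(10)})
print(df["a"].sum())
```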
diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index f60e4ff81ef..97e1dffc65b 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -49,8 +49,24 @@ def to_dask_dataframe(self, **kwargs): return self.to_backend("pandas", **kwargs) + def _prepare_cov_corr(self, min_periods, numeric_only): + # Upstream version of this method sets min_periods + # to 2 by default (which is not supported by cudf) + # TODO: Remove when cudf supports both min_periods + # and numeric_only + # See: https://github.com/rapidsai/cudf/issues/12626 + # See: https://github.com/rapidsai/cudf/issues/9009 + self._meta.cov(min_periods=min_periods) + + frame = self + if numeric_only: + numerics = self._meta._get_numeric_data() + if len(numerics.columns) != len(self.columns): + frame = frame[list(numerics.columns)] + return frame, min_periods + # var can be removed if cudf#15179 is addressed. - # See: https://github.com/rapidsai/cudf/issues/15179 + # See: https://github.com/rapidsai/cudf/issues/14935 def var( self, axis=0, diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 905d8c08135..7aa0f6320f2 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1007,3 +1007,20 @@ def test_to_backend_simplify(): df2 = df.to_backend("cudf")[["y"]].simplify() df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify() assert df2._name == df3._name + + +@pytest.mark.parametrize("numeric_only", [True, False]) +@pytest.mark.parametrize("op", ["corr", "cov"]) +def test_cov_corr(op, numeric_only): + df = cudf.DataFrame.from_dict( + { + "x": np.random.randint(0, 5, size=10), + "y": np.random.normal(size=10), + } + ) + ddf = dd.from_pandas(df, npartitions=2) + res = getattr(ddf, op)(numeric_only=numeric_only) + # Use to_pandas until cudf supports numeric_only + # (See: https://github.com/rapidsai/cudf/issues/12626) + expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) + dd.assert_eq(res, expect) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 0c352a5068b..f2dd22f43aa 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -19,3 +19,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &seeds, const size_type width, ) except + + + cdef unique_ptr[column] word_minhash( + const column_view &input, + const column_view &seeds + ) except + + + cdef unique_ptr[column] word_minhash64( + const column_view &input, + const column_view &seeds + ) except +
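To make the semantics behind the new `word_minhash`/`word_minhash64` declarations concrete: for each seed, the min-hash of a row is the minimum hash value over that row's words, so each row yields one value per seed. A rough pure-Python sketch of the idea follows; `toy_hash` is a hypothetical stand-in for illustration only, whereas libcudf actually uses MurmurHash3_x86_32 for `word_minhash` and MurmurHash3_x64_128 (keeping only the first uint64) for `word_minhash64`:

```python
def word_minhash_row(words, seeds, hash_fn):
    # One output value per seed: the minimum hash over all words in the row.
    return [min(hash_fn(word, seed) for word in words) for seed in seeds]


def toy_hash(word, seed):
    # Hypothetical stand-in hash, NOT the MurmurHash3 variants used by libcudf.
    h = seed & 0xFFFFFFFF
    for byte in word.encode():
        h = (h * 31 + byte) & 0xFFFFFFFF
    return h


# Mirrors the docstring example's input shape: a list column of word rows.
rows = [["this", "is", "my"], ["favorite", "book"]]
for row in rows:
    print(word_minhash_row(row, seeds=[0, 1, 2], hash_fn=toy_hash))
```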