Skip to content

Commit

Permalink
Merge branch 'branch-24.10' into cudf.pandas-repl
Browse files Browse the repository at this point in the history
  • Loading branch information
bdice authored Sep 17, 2024
2 parents 2a8fc75 + a112f68 commit 0c9e469
Show file tree
Hide file tree
Showing 738 changed files with 5,704 additions and 3,469 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ build/
cpp/build/
cpp/examples/*/install/
cpp/examples/*/build/
cpp/examples/tpch/datagen/datafusion
cpp/include/cudf/ipc_generated/*.h
cpp/thirdparty/googletest/

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12

### Conda

cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel:
cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:

```bash
conda install -c rapidsai -c conda-forge -c nvidia \
Expand Down
2 changes: 1 addition & 1 deletion ci/cudf_pandas_scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ IFS=',' read -r -a versions <<< "$output"

for version in "${versions[@]}"; do
echo "Installing pandas version: ${version}"
python -m pip install "numpy>=1.23,<2.0a0" "pandas==${version}"
python -m pip install "numpy>=1.23,<2.0a0" "pandas==${version}.*"
python -m pytest -p cudf.pandas \
--ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \
--cov-config=./python/cudf/.coveragerc \
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ add_library(
src/io/json/nested_json_gpu.cu
src/io/json/read_json.cu
src/io/json/parser_features.cpp
src/io/json/process_tokens.cu
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
Expand Down
28 changes: 18 additions & 10 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,25 +36,25 @@ target_include_directories(
)

add_library(
tpch_data_generator STATIC
common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp
common/tpch_data_generator/random_column_generator.cu
ndsh_data_generator STATIC
common/ndsh_data_generator/ndsh_data_generator.cpp common/ndsh_data_generator/table_helpers.cpp
common/ndsh_data_generator/random_column_generator.cu
)
target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17)
target_compile_features(ndsh_data_generator PUBLIC cxx_std_17 cuda_std_17)

target_compile_options(
tpch_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
ndsh_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
)

target_link_libraries(
tpch_data_generator
ndsh_data_generator
PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp
PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
)

target_include_directories(
tpch_data_generator
ndsh_data_generator
PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
)
Expand Down Expand Up @@ -127,8 +127,8 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::nvbench
$<TARGET_NAME_IF_EXISTS:conda_env>
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen
nvbench::nvbench $<TARGET_NAME_IF_EXISTS:conda_env>
)
install(
TARGETS ${CMAKE_BENCH_NAME}
Expand Down Expand Up @@ -175,6 +175,14 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp)
# * transpose benchmark ---------------------------------------------------------------------------
ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp)

# ##################################################################################################
# * nds-h benchmark --------------------------------------------------------------------------------
ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp)

# ##################################################################################################
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureNVBench(
Expand Down Expand Up @@ -329,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
)

# ##################################################################################################
Expand Down
14 changes: 7 additions & 7 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cuda/functional>
#include <thrust/binary_search.h>
Expand Down Expand Up @@ -507,7 +507,7 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
null_mask.end(),
thrust::identity<bool>{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
cudf::get_current_device_resource_ref());

return std::make_unique<cudf::column>(
dtype,
Expand Down Expand Up @@ -591,7 +591,7 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
null_mask.end() - 1,
thrust::identity<bool>{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
cudf::get_current_device_resource_ref());
return cudf::make_strings_column(
num_rows,
std::make_unique<cudf::column>(std::move(offsets), rmm::device_buffer{}, 0),
Expand Down Expand Up @@ -626,7 +626,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::string_view>(data_profi
cudf::out_of_bounds_policy::DONT_CHECK,
cudf::detail::negative_index_policy::NOT_ALLOWED,
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
cudf::get_current_device_resource_ref());
return std::move(str_table->release()[0]);
}

Expand Down Expand Up @@ -688,7 +688,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profi
valids.end(),
thrust::identity<bool>{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
cudf::get_current_device_resource_ref());
}
return std::pair<rmm::device_buffer, cudf::size_type>{};
}();
Expand Down Expand Up @@ -782,7 +782,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
valids.end(),
thrust::identity<bool>{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
cudf::get_current_device_resource_ref());
list_column = cudf::make_lists_column(
current_num_rows,
std::move(offsets_column),
Expand Down Expand Up @@ -933,7 +933,7 @@ std::pair<rmm::device_buffer, cudf::size_type> create_random_null_mask(
thrust::make_counting_iterator<cudf::size_type>(size),
bool_generator{seed, 1.0 - *null_probability},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
cudf::get_current_device_resource_ref());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include "tpch_data_generator.hpp"
#include "ndsh_data_generator.hpp"

#include "random_column_generator.hpp"
#include "table_helpers.hpp"
Expand All @@ -36,6 +36,9 @@
#include <cudf/transform.hpp>
#include <cudf/unary.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/resource_ref.hpp>

#include <array>
#include <string>
#include <vector>
Expand Down Expand Up @@ -432,46 +435,37 @@ std::unique_ptr<cudf::table> generate_lineitem_partial(cudf::table_view const& o
columns.push_back(std::move(l_quantity));
columns.push_back(std::move(l_discount));
columns.push_back(std::move(l_tax));
columns.push_back(std::move(l_returnflag));
columns.push_back(std::move(l_linestatus));
columns.push_back(std::move(l_shipdate_ts));
columns.push_back(std::move(l_commitdate_ts));
columns.push_back(std::move(l_receiptdate_ts));
columns.push_back(std::move(l_returnflag));
columns.push_back(std::move(l_linestatus));
columns.push_back(std::move(l_shipinstruct));
columns.push_back(std::move(l_shipmode));
columns.push_back(std::move(l_comment));
return std::make_unique<cudf::table>(std::move(columns));
}

std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem,
/**
* @brief Generate the part of the `orders` table dependent on the `lineitem` table
*
* @param lineitem_partial The partially generated `lineitem` table
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table's device memory
*/
std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem_partial,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
auto const l_linestatus_mask = lineitem.column(0);
auto const l_orderkey = lineitem.column(1);
auto const l_discount = lineitem.column(6);
auto const l_tax = lineitem.column(7);
auto const l_extendedprice = lineitem.column(16);
auto const l_linestatus_mask = lineitem_partial.column(0);
auto const l_orderkey = lineitem_partial.column(1);
auto const l_extendedprice = lineitem_partial.column(6);
auto const l_discount = lineitem_partial.column(7);
auto const l_tax = lineitem_partial.column(8);

std::vector<std::unique_ptr<cudf::column>> orders_dependent_columns;

// Generate the `o_totalprice` column
// We calculate the `charge` column, which is a function of `l_extendedprice`,
// `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
auto o_totalprice = [&]() {
auto const keys = cudf::table_view({l_orderkey});
cudf::groupby::groupby gb(keys);
std::vector<cudf::groupby::aggregation_request> requests;
requests.push_back(cudf::groupby::aggregation_request());
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
requests[0].values = l_charge->view();
auto agg_result = gb.aggregate(requests);
return cudf::round(agg_result.second[0].results[0]->view(), 2);
}();
orders_dependent_columns.push_back(std::move(o_totalprice));

// Generate the `o_orderstatus` column
auto o_orderstatus = [&]() {
auto const keys = cudf::table_view({l_orderkey});
Expand Down Expand Up @@ -526,6 +520,22 @@ std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& l
cudf::string_scalar("P"), o_orderstatus_intermediate->view(), mask_b->view());
}();
orders_dependent_columns.push_back(std::move(o_orderstatus));

// Generate the `o_totalprice` column
// We calculate the `charge` column, which is a function of `l_extendedprice`,
// `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
auto o_totalprice = [&]() {
auto const keys = cudf::table_view({l_orderkey});
cudf::groupby::groupby gb(keys);
std::vector<cudf::groupby::aggregation_request> requests;
requests.push_back(cudf::groupby::aggregation_request());
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
requests[0].values = l_charge->view();
auto agg_result = gb.aggregate(requests);
return cudf::round(agg_result.second[0].results[0]->view(), 2);
}();
orders_dependent_columns.push_back(std::move(o_totalprice));
return std::make_unique<cudf::table>(std::move(orders_dependent_columns));
}

Expand Down Expand Up @@ -727,9 +737,7 @@ generate_orders_lineitem_part(double scale_factor,
// Generate the `part` table
auto part = generate_part(scale_factor, stream, mr);

// Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column,
// add the column to the `lineitem` table, and write the `lineitem` table to a parquet file

// Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column
auto l_extendedprice = [&]() {
auto const left = cudf::table_view(
{lineitem_partial->get_column(2).view(), lineitem_partial->get_column(5).view()});
Expand All @@ -749,8 +757,9 @@ generate_orders_lineitem_part(double scale_factor,
return cudf::round(col->view(), 2);
}();

// Insert the `l_extendedprice` column into the partial columns of the `lineitem` table
auto lineitem_partial_columns = lineitem_partial->release();
lineitem_partial_columns.push_back(std::move(l_extendedprice));
lineitem_partial_columns.insert(lineitem_partial_columns.begin() + 6, std::move(l_extendedprice));
auto lineitem_temp = std::make_unique<cudf::table>(std::move(lineitem_partial_columns));

// Generate the dependent columns of the `orders` table
Expand All @@ -759,7 +768,7 @@ generate_orders_lineitem_part(double scale_factor,

auto orders_independent_columns = orders_independent->release();
auto orders_dependent_columns = orders_dependent->release();
orders_independent_columns.insert(orders_independent_columns.end(),
orders_independent_columns.insert(orders_independent_columns.begin() + 2,
std::make_move_iterator(orders_dependent_columns.begin()),
std::make_move_iterator(orders_dependent_columns.end()));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#pragma once

#include <cudf/table/table.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/memory_resource.hpp>

namespace CUDF_EXPORT cudf {
namespace datagen {
Expand All @@ -32,7 +34,7 @@ std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::uniq
generate_orders_lineitem_part(
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Generate the `partsupp` table
Expand All @@ -44,7 +46,7 @@ generate_orders_lineitem_part(
std::unique_ptr<cudf::table> generate_partsupp(
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Generate the `supplier` table
Expand All @@ -56,7 +58,7 @@ std::unique_ptr<cudf::table> generate_partsupp(
std::unique_ptr<cudf::table> generate_supplier(
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Generate the `customer` table
Expand All @@ -68,7 +70,7 @@ std::unique_ptr<cudf::table> generate_supplier(
std::unique_ptr<cudf::table> generate_customer(
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Generate the `nation` table
Expand All @@ -78,7 +80,7 @@ std::unique_ptr<cudf::table> generate_customer(
*/
std::unique_ptr<cudf::table> generate_nation(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Generate the `region` table
Expand All @@ -88,7 +90,7 @@ std::unique_ptr<cudf::table> generate_nation(
*/
std::unique_ptr<cudf::table> generate_region(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

} // namespace datagen
} // namespace CUDF_EXPORT cudf
Loading

0 comments on commit 0c9e469

Please sign in to comment.