Skip to content

Commit

Permalink
Merge branch 'branch-24.12' into fea/cudf-polars/report-all-unsupported-ops
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt711 authored Oct 8, 2024
2 parents c1e2d37 + 2d02bdc commit e7ae673
Show file tree
Hide file tree
Showing 182 changed files with 2,870 additions and 1,238 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/pr_issue_status_automation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,33 @@ jobs:
UPDATE_ITEM: true
UPDATE_LINKED_ISSUES: true
secrets: inherit

# Job: derive the release name from the pull request's base branch and expose
# it as the `branch-name` job output for downstream jobs.
# Runs only when the PR is open and get-project-id produced a non-empty
# ITEM_PROJECT_ID.
process-branch-name:
if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
needs: get-project-id
runs-on: ubuntu-latest
outputs:
branch-name: ${{ steps.process-branch-name.outputs.branch-name }}
steps:
- name: Extract branch name
id: process-branch-name
# The script strips a leading "branch-" from the base ref via the shell
# expansion ${branch#branch-} (e.g. "branch-24.12" -> "24.12"); a ref
# without that prefix is passed through unchanged. The result is written
# to $GITHUB_OUTPUT under the key `branch-name`.
# NOTE(review): the base ref is interpolated directly into the script text;
# consider passing it through `env:` and reading "$BASE_REF" instead.
run: |
branch=${{ github.event.pull_request.base.ref }}
release=${branch#branch-}
echo "branch-name=$release" >> "$GITHUB_OUTPUT"
update-release:
# This job sets the PR and its linked issues to the release they are targeting
# It calls a reusable workflow, passing the "Release" single-select field and
# the value computed by process-branch-name (its `branch-name` output).
# Runs under the same condition as process-branch-name: the PR is open and
# get-project-id produced a non-empty ITEM_PROJECT_ID.
# NOTE(review): the workflow file name/ref below reads "[email protected]" in
# this capture, which looks like e-mail obfuscation of "<file>.yaml@<ref>" -
# confirm the real reference against the repository.
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
needs: [get-project-id, process-branch-name]
with:
PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"
SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ"
SINGLE_SELECT_FIELD_NAME: "Release"
SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}"
ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}"
ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}"
UPDATE_ITEM: true
UPDATE_LINKED_ISSUES: true
secrets: inherit
15 changes: 9 additions & 6 deletions ci/test_cudf_polars_polars_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,17 @@ rapids-logger "Download wheels"
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist

# Download the pylibcudf built in the previous step
RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
# Download libcudf and pylibcudf built in the previous step
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep
RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep

rapids-logger "Install pylibcudf"
python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
rapids-logger "Install libcudf, pylibcudf and cudf_polars"
python -m pip install \
-v \
"$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
"$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"

rapids-logger "Install cudf_polars"
python -m pip install $(echo ./dist/cudf_polars*.whl)

TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
rapids-logger "Clone polars to ${TAG}"
Expand Down
2 changes: 1 addition & 1 deletion ci/test_python_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf

rapids-logger "Check GPU usage"
nvidia-smi

rapids-print-env
EXITCODE=0
trap "EXITCODE=1" ERR
set +e
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ dependencies:
- openpyxl
- packaging
- pandas
- pandas>=2.0,<2.2.3dev0
- pandas>=2.0,<2.2.4dev0
- pandoc
- polars>=1.8,<1.9
- pre-commit
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ dependencies:
- openpyxl
- packaging
- pandas
- pandas>=2.0,<2.2.3dev0
- pandas>=2.0,<2.2.4dev0
- pandoc
- polars>=1.8,<1.9
- pre-commit
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ requirements:
run:
- python
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.3dev0
- pandas >=2.0,<2.2.4dev0
- cupy >=12.0.0
- numba-cuda >=0.0.13
- numpy >=1.23,<3.0a0
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/pylibcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ requirements:
run:
- python
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.3dev0
- pandas >=2.0,<2.2.4dev0
- numpy >=1.23,<3.0a0
- pyarrow>=14.0.0,<18.0.0a0
- {{ pin_compatible('rmm', max_pin='x.x') }}
Expand Down
43 changes: 36 additions & 7 deletions cpp/.clang-tidy
Original file line number Diff line number Diff line change
@@ -1,18 +1,47 @@
---
# Notes on disabled checks
# ------------------------
# modernize-use-equals-default:
# auto-fix is broken (doesn't insert =default correctly)
# modernize-concat-nested-namespaces:
# auto-fix is broken (can delete code)
# modernize-use-trailing-return-type:
# Purely stylistic, no benefit to rewriting everything
# modernize-return-braced-init-list:
# Stylistically we prefer to see the return type at the return site.
# See https://github.com/rapidsai/cudf/pull/16956#pullrequestreview-2341891672
# for more information.
# modernize-use-bool-literals:
# Our tests use int flags for validity masks extensively and we prefer that
# clang-analyzer-cplusplus.NewDeleteLeaks:
# This check has numerous bugs, see
# https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+newdeleteleaks
# We encounter at least
# https://github.com/llvm/llvm-project/issues/60896
# https://github.com/llvm/llvm-project/issues/69602
# clang-analyzer-optin.core.EnumCastOutOfRange:
# We use enums as flags in multiple cases and this check makes ORing flags invalid
# clang-analyzer-optin.cplusplus.UninitializedObject:
# There is an error in nanoarrow that none of the clang-tidy filters (i.e.
# header-filter and exclude-header-filter) are able to properly avoid. This
# merits further investigation.
#
# We need to verify that broken checks are still broken
Checks:
'modernize-*,
-modernize-use-equals-default,
-modernize-concat-nested-namespaces,
-modernize-use-trailing-return-type,
-modernize-use-bool-literals'

# -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly)
# -modernize-concat-nested-namespaces # auto-fix is broken (can delete code)
# -modernize-use-trailing-return-type # just a preference
-modernize-return-braced-init-list,
-modernize-use-bool-literals,
clang-analyzer-*,
-clang-analyzer-cplusplus.NewDeleteLeaks,
-clang-analyzer-optin.core.EnumCastOutOfRange,
-clang-analyzer-optin.cplusplus.UninitializedObject'

WarningsAsErrors: ''
HeaderFilterRegex: ''
AnalyzeTemporaryDtors: false
HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*'
ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*'
FormatStyle: none
CheckOptions:
- key: modernize-loop-convert.MaxCopySize
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON)
option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON)
mark_as_advanced(CUDF_BUILD_TESTUTIL)
option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON)
option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON)
option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF)
mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED)
option(
Expand Down
9 changes: 2 additions & 7 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)

# ##################################################################################################
# * ast benchmark ---------------------------------------------------------------------------------
ConfigureBench(AST_BENCH ast/transform.cpp)
ConfigureNVBench(AST_NVBENCH ast/transform.cpp)

# ##################################################################################################
# * binaryop benchmark ----------------------------------------------------------------------------
ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)

# ##################################################################################################
# * nvtext benchmark -------------------------------------------------------------------
Expand Down Expand Up @@ -392,11 +392,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader
ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp)
ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)

# ##################################################################################################
# * multi buffer memset benchmark
# ----------------------------------------------------------------------
ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp)

# ##################################################################################################
# * io benchmark ---------------------------------------------------------------------
ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)
Expand Down
51 changes: 17 additions & 34 deletions cpp/benchmarks/ast/transform.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,14 +15,16 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/transform.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <list>
#include <memory>
Expand All @@ -35,13 +37,10 @@ enum class TreeType {
};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
class AST : public cudf::benchmark {};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
static void BM_ast_transform(benchmark::State& state)
static void BM_ast_transform(nvbench::state& state)
{
auto const table_size{static_cast<cudf::size_type>(state.range(0))};
auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));

// Create table data
auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
Expand Down Expand Up @@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state)

auto const& expression_tree_root = expressions.back();

// Execute benchmark
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf::compute_column(table, expression_tree_root);
}

// Use the number of bytes read from global memory
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
(tree_levels + 1) * sizeof(key_type));
}
state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));

static void CustomRanges(benchmark::internal::Benchmark* b)
{
auto row_counts = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
for (auto const& row_count : row_counts) {
for (auto const& operation_count : operation_counts) {
b->Args({row_count, operation_count});
}
}
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
}

#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \
(::benchmark::State & st) \
static void name(::nvbench::state& st) \
{ \
BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); \
::BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); \
} \
BENCHMARK_REGISTER_F(AST, name) \
->Apply(CustomRanges) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();
NVBENCH_BENCH(name) \
.set_name(#name) \
.add_int64_axis("tree_levels", {1, 5, 10}) \
.add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})

AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
Expand Down
65 changes: 19 additions & 46 deletions cpp/benchmarks/binaryop/binaryop.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,15 +15,14 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/binaryop.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <vector>

// This set of benchmarks is designed to be a comparison for the AST benchmarks

Expand All @@ -33,23 +32,21 @@ enum class TreeType {
};

template <typename key_type, TreeType tree_type, bool reuse_columns>
class BINARYOP : public cudf::benchmark {};

template <typename key_type, TreeType tree_type, bool reuse_columns>
static void BM_binaryop_transform(benchmark::State& state)
static void BM_binaryop_transform(nvbench::state& state)
{
auto const table_size{static_cast<cudf::size_type>(state.range(0))};
auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
auto const table_size{static_cast<cudf::size_type>(state.get_int64("table_size"))};
auto const tree_levels{static_cast<cudf::size_type>(state.get_int64("tree_levels"))};

// Create table data
auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
auto const source_table = create_sequence_table(
cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
cudf::table_view table{*source_table};

// Execute benchmark
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
// Use the number of bytes read from global memory
state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
// Execute tree that chains additions like (((a + b) + c) + d)
auto const op = cudf::binary_operator::ADD;
auto const result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
Expand All @@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state)
result = cudf::binary_operation(result->view(), col, op, result_data_type);
});
}
}

// Use the number of bytes read from global memory
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
(tree_levels + 1) * sizeof(key_type));
});
}

#define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \
(::benchmark::State & st) { BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); }
\
static void name(::nvbench::state& st) \
{ \
BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); \
} \
NVBENCH_BENCH(name) \
.add_int64_axis("tree_levels", {1, 2, 5, 10}) \
.add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})

BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique,
int32_t,
Expand All @@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
double,
TreeType::IMBALANCED_LEFT,
false);

static void CustomRanges(benchmark::internal::Benchmark* b)
{
auto row_counts = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
auto operation_counts = std::vector<cudf::size_type>{1, 2, 5, 10};
for (auto const& row_count : row_counts) {
for (auto const& operation_count : operation_counts) {
b->Args({row_count, operation_count});
}
}
}

BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();
Loading

0 comments on commit e7ae673

Please sign in to comment.