Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.10' into simplify-remote-io
Browse files Browse the repository at this point in the history
  • Loading branch information
rjzamora committed Aug 13, 2024
2 parents 00c47fa + 3a791cb commit 491c140
Show file tree
Hide file tree
Showing 194 changed files with 4,326 additions and 1,431 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ Debug
build/
cpp/build/
cpp/examples/*/install/
cpp/examples/*/build/
cpp/examples/tpch/datagen/datafusion
cpp/include/cudf/ipc_generated/*.h
cpp/thirdparty/googletest/

Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ repos:
- id: ruff-format
files: python/.*$
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v0.2.0
rev: v0.3.1
hooks:
- id: verify-copyright
exclude: |
Expand Down
376 changes: 376 additions & 0 deletions CHANGELOG.md

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,16 @@ CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}

#Get <major>.<minor> for next version
# Get <major>.<minor> for next version
NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}')
NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*"

# Need to distutils-normalize the versions for some use cases
CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))")
NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))")
echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}"

echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"

Expand Down Expand Up @@ -61,7 +59,7 @@ for DEP in "${DEPENDENCIES[@]}"; do
sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
done
for FILE in python/*/pyproject.toml; do
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE}
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
done
done

Expand All @@ -77,7 +75,7 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_
# CI files
for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do
sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE};
sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
done
sed_runner "s/branch-[0-9]+\.[0-9]+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh

Expand Down
3 changes: 0 additions & 3 deletions ci/run_cudf_memcheck_ctests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ export LIBCUDF_MEMCHECK_ENABLED=1
# Run every gtest binary in the current directory under compute-sanitizer's
# memcheck tool, forwarding any extra script arguments to each test.
for gt in ./*_TEST ; do
  # Quote ${gt}: unquoted expansion is subject to word splitting/globbing (SC2086).
  test_name=$(basename "${gt}")
  # Run gtests with compute-sanitizer
  # NOTE(review): these two appear to be deliberately excluded from memcheck —
  # presumably incompatible with compute-sanitizer; confirm before changing.
  if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then
    continue
  fi
  echo "Running compute-sanitizer on $test_name"
  compute-sanitizer --tool memcheck "${gt}" "$@"
done
Expand Down
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dependencies:
- dlpack>=0.8,<1.0
- doxygen=1.9.1
- fastavro>=0.22.9
- flatbuffers==24.3.25
- fmt>=10.1.1,<11
- fsspec>=0.6.0
- gcc_linux-64=11.*
Expand Down Expand Up @@ -81,7 +82,7 @@ dependencies:
- rich
- rmm==24.10.*,>=0.0.0a0
- s3fs>=2022.3.0
- scikit-build-core>=0.7.0
- scikit-build-core>=0.10.0
- scipy
- spdlog>=1.12.0,<1.13
- sphinx
Expand Down
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dependencies:
- dlpack>=0.8,<1.0
- doxygen=1.9.1
- fastavro>=0.22.9
- flatbuffers==24.3.25
- fmt>=10.1.1,<11
- fsspec>=0.6.0
- gcc_linux-64=11.*
Expand Down Expand Up @@ -79,7 +80,7 @@ dependencies:
- rich
- rmm==24.10.*,>=0.0.0a0
- s3fs>=2022.3.0
- scikit-build-core>=0.7.0
- scikit-build-core>=0.10.0
- scipy
- spdlog>=1.12.0,<1.13
- sphinx
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ requirements:
- python
- cython >=3.0.3
- rapids-build-backend >=0.3.0,<0.4.0.dev0
- scikit-build-core >=0.7.0
- scikit-build-core >=0.10.0
- dlpack >=0.8,<1.0
# TODO: Change to `2.0` for NumPy 2
- numpy 1.23
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf_kafka/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ requirements:
- cudf ={{ version }}
- libcudf_kafka ={{ version }}
- rapids-build-backend >=0.3.0,<0.4.0.dev0
- scikit-build-core >=0.7.0
- scikit-build-core >=0.10.0
{% if cuda_major != "11" %}
- cuda-cudart-dev
{% endif %}
Expand Down
3 changes: 3 additions & 0 deletions conda/recipes/libcudf/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ librdkafka_version:
fmt_version:
- ">=10.1.1,<11"

flatbuffers_version:
- "=24.3.25"

spdlog_version:
- ">=1.12.0,<1.13"

Expand Down
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ requirements:
- dlpack {{ dlpack_version }}
- librdkafka {{ librdkafka_version }}
- fmt {{ fmt_version }}
- flatbuffers {{ flatbuffers_version }}
- spdlog {{ spdlog_version }}
- zlib {{ zlib_version }}

Expand Down
1 change: 0 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,6 @@ add_library(
src/io/csv/reader_impl.cu
src/io/csv/writer_impl.cu
src/io/functions.cpp
src/io/json/byte_range_info.cu
src/io/json/json_column.cu
src/io/json/json_normalization.cu
src/io/json/json_tree.cu
Expand Down
6 changes: 6 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ ConfigureNVBench(
stream_compaction/distinct.cpp
stream_compaction/distinct_count.cpp
stream_compaction/stable_distinct.cpp
stream_compaction/stream_compaction_common.cpp
stream_compaction/unique.cpp
stream_compaction/unique_count.cpp
)
Expand Down Expand Up @@ -353,6 +354,11 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader
ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp)
ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)

# ##################################################################################################
# * multi buffer memset benchmark ----------------------------------------------------------------
ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp)

# ##################################################################################################
# * io benchmark ---------------------------------------------------------------------
ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)
Expand Down
101 changes: 101 additions & 0 deletions cpp/benchmarks/io/utilities/batched_memset_bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/io/cuio_common.hpp>
#include <benchmarks/io/nvbench_helpers.hpp>

#include <cudf/io/parquet.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr size_t data_size = 512 << 20;  // 512 MiB

/**
 * @brief Time reading back the parquet data held in `source_sink` and report results.
 *
 * Runs cudf::io::read_parquet() inside the nvbench timing loop, then attaches
 * throughput, peak memory usage, and encoded file size summaries to `state`.
 *
 * @param num_rows_to_read Expected row count of the decoded table
 * @param num_cols_to_read Expected column count of the decoded table
 * @param source_sink Source/sink pair already populated with parquet data by the caller
 * @param state nvbench state used for timing and reporting
 */
void parquet_read_common(cudf::size_type num_rows_to_read,
                         cudf::size_type num_cols_to_read,
                         cuio_source_sink_pair& source_sink,
                         nvbench::state& state)
{
  cudf::io::parquet_reader_options const read_opts =
    cudf::io::parquet_reader_options::builder(source_sink.make_source_info());

  // Construct before exec() so the logger observes allocations made during the reads.
  auto memory_logger = cudf::memory_stats_logger();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
             [&](nvbench::launch& launch, auto& timer) {
               try_drop_l3_cache();  // start every sample from a cold cache

               timer.start();
               auto const result = cudf::io::read_parquet(read_opts);
               timer.stop();

               CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read,
                            "Unexpected number of columns");
               CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read,
                            "Unexpected number of rows");
             });

  auto const mean_gpu_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  state.add_element_count(static_cast<double>(data_size) / mean_gpu_time, "bytes_per_second");
  state.add_buffer_size(
    memory_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

/**
 * @brief Benchmark entry point: write a random table to parquet, then time reading it back.
 *
 * Axis values (num_cols, cardinality, run_length, io_type) are pulled from the nvbench
 * state; the data type is supplied as a compile-time enum. Compression is disabled so the
 * read path (which exercises the batched memset) dominates.
 */
template <data_type DataType>
void bench_batched_memset(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
{
  auto const num_cols    = static_cast<cudf::size_type>(state.get_int64("num_cols"));
  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
  auto const dtypes      = get_type_or_group(static_cast<int32_t>(DataType));

  cuio_source_sink_pair source_sink(retrieve_io_type_enum(state.get_string("io_type")));

  auto const table =
    create_random_table(cycle_dtypes(dtypes, num_cols),
                        table_size_bytes{data_size},
                        data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
  auto const input_view = table->view();

  cudf::io::parquet_writer_options const writer_opts =
    cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), input_view)
      .compression(cudf::io::compression_type::NONE);
  cudf::io::write_parquet(writer_opts);

  parquet_read_common(input_view.num_rows(), num_cols, source_sink, state);
}

// Data-type groups the benchmark is instantiated over (one benchmark per enum value).
using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::FLOAT,
data_type::DECIMAL,
data_type::TIMESTAMP,
data_type::DURATION,
data_type::STRING,
data_type::LIST,
data_type::STRUCT>;

// Register the benchmark: axes are the data type (compile-time), column count,
// I/O sink type, cardinality and average run length of the generated data.
NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list))
.set_name("batched_memset")
.set_type_axes_names({"data_type"})
.add_int64_axis("num_cols", {1000})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});
77 changes: 0 additions & 77 deletions cpp/benchmarks/iterator/iterator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/pair.h>
#include <thrust/reduce.h>

#include <random>
Expand Down Expand Up @@ -161,68 +160,6 @@ void BM_iterator(benchmark::State& state)
sizeof(TypeParam));
}

// operator+ defined for pair iterator reduction
// Accumulates (value, validity) pairs: the value term adds each value only when its
// validity flag is set (value * bool), and the validity term is the integer sum of the
// two flags narrowed back to bool — i.e. true if either operand was valid.
template <typename T>
__device__ thrust::pair<T, bool> operator+(thrust::pair<T, bool> lhs, thrust::pair<T, bool> rhs)
{
  return thrust::pair<T, bool>{lhs.first * lhs.second + rhs.first * rhs.second,
                               lhs.second + rhs.second};
}
// -----------------------------------------------------------------------------
// Reduce `col` through its (value, validity) pair iterator with cub, writing the
// single reduced pair into `result` (via the reduce_by_cub helper defined elsewhere
// in this file). `has_null` selects whether the iterator consults the validity mask.
template <typename T, bool has_null>
void pair_iterator_bench_cub(cudf::column_view& col,
rmm::device_uvector<thrust::pair<T, bool>>& result)
{
// Identity element for the masked-sum reduction: value 0, flagged invalid.
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
int num_items = col.size();
auto begin = d_col->pair_begin<T, has_null>();
reduce_by_cub(result.begin(), begin, num_items, init);
}

// Reduce `col` through its (value, validity) pair iterator with thrust::reduce.
// NOTE(review): unlike the cub variant, `result` is never written and the value
// returned by thrust::reduce is discarded — the call exists only to time the
// device-side reduction; confirm this asymmetry is intentional.
template <typename T, bool has_null>
void pair_iterator_bench_thrust(cudf::column_view& col,
rmm::device_uvector<thrust::pair<T, bool>>& result)
{
// Identity element for the masked-sum reduction: value 0, flagged invalid.
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
auto d_in = d_col->pair_begin<T, has_null>();
auto d_end = d_in + col.size();
thrust::reduce(thrust::device, d_in, d_end, init, cudf::DeviceSum{});
}

/**
 * @brief Benchmark a masked reduction driven by the column pair iterator
 * (value, validity) over a column containing nulls.
 *
 * @tparam TypeParam Element type of the benchmarked column
 * @tparam cub_or_thrust true selects the cub reduction path, false the thrust path
 *
 * Fixes relative to the previous version: the no-null column (`wrap_hasnull_F`/
 * `hasnull_F`) was constructed but never used, and both calls passed the null-masked
 * column with `has_null=false`, contradicting the "with nulls" intent — the iterator
 * now actually consults the validity mask.
 */
template <class TypeParam, bool cub_or_thrust>
void BM_pair_iterator(benchmark::State& state)
{
  cudf::size_type const column_size{static_cast<cudf::size_type>(state.range(0))};
  using T      = TypeParam;
  auto num_gen = thrust::counting_iterator<cudf::size_type>(0);
  // Every even row is flagged valid, odd rows are null.
  auto null_gen =
    thrust::make_transform_iterator(num_gen, [](cudf::size_type row) { return row % 2 == 0; });

  // Column with a validity mask attached.
  cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_T(
    num_gen, num_gen + column_size, null_gen);
  cudf::column_view hasnull_T = wrap_hasnull_T;

  // Initialize dev_result to false
  auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<thrust::pair<T, bool>>(
    1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
  for (auto _ : state) {
    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
    if (cub_or_thrust) {
      pair_iterator_bench_cub<T, true>(hasnull_T,
                                       dev_result);  // driven by pair iterator with nulls
    } else {
      pair_iterator_bench_thrust<T, true>(hasnull_T,
                                          dev_result);  // driven by pair iterator with nulls
    }
  }
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
                          sizeof(TypeParam));
}

#define ITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust, raw_or_iterator) \
BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \
{ \
Expand All @@ -238,17 +175,3 @@ ITER_BM_BENCHMARK_DEFINE(double_cub_raw, double, true, true);
ITER_BM_BENCHMARK_DEFINE(double_cub_iter, double, true, false);
ITER_BM_BENCHMARK_DEFINE(double_thrust_raw, double, false, true);
ITER_BM_BENCHMARK_DEFINE(double_thrust_iter, double, false, false);

// Registers a Google Benchmark fixture test for BM_pair_iterator: column sizes range
// from 1e3 to 1e7 in x10 steps, manually timed (cuda_event_timer), reported in ms.
#define PAIRITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust) \
BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \
{ \
BM_pair_iterator<type, cub_or_thrust>(state); \
} \
BENCHMARK_REGISTER_F(Iterator, name) \
->RangeMultiplier(10) \
->Range(1000, 10000000) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Instantiate the cub-driven and thrust-driven variants for double columns.
PAIRITER_BM_BENCHMARK_DEFINE(double_cub_pair, double, true);
PAIRITER_BM_BENCHMARK_DEFINE(double_thrust_pair, double, false);
Loading

0 comments on commit 491c140

Please sign in to comment.