Skip to content

Commit

Permalink
Merge branch 'branch-24.12' of https://github.com/rapidsai/cudf into …
Browse files Browse the repository at this point in the history
…fea-remove-cudamemcpy-io
  • Loading branch information
vuule committed Oct 21, 2024
2 parents 75dc549 + 074ab74 commit 6b9881b
Show file tree
Hide file tree
Showing 55 changed files with 1,428 additions and 1,199 deletions.
5 changes: 5 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,13 @@ add_library(
src/filling/repeat.cu
src/filling/sequence.cu
src/groupby/groupby.cu
src/groupby/hash/compute_groupby.cu
src/groupby/hash/compute_single_pass_aggs.cu
src/groupby/hash/create_sparse_results_table.cu
src/groupby/hash/flatten_single_pass_aggs.cpp
src/groupby/hash/groupby.cu
src/groupby/hash/hash_compound_agg_finalizer.cu
src/groupby/hash/sparse_to_dense_results.cu
src/groupby/sort/aggregate.cpp
src/groupby/sort/group_argmax.cu
src/groupby/sort/group_argmin.cu
Expand Down
2 changes: 1 addition & 1 deletion cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ any type that cudf supports. For example, a `list_scalar` representing a list of
|Value type|Scalar class|Notes|
|-|-|-|
|fixed-width|`fixed_width_scalar<T>`| `T` can be any fixed-width type|
|numeric|`numeric_scalar<T>` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int_64_t`, `float` or `double`|
|numeric|`numeric_scalar<T>` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int64_t`, `float` or `double`|
|fixed-point|`fixed_point_scalar<T>` | `T` can be `numeric::decimal32` or `numeric::decimal64`|
|timestamp|`timestamp_scalar<T>` | `T` can be `timestamp_D`, `timestamp_s`, etc.|
|duration|`duration_scalar<T>` | `T` can be `duration_D`, `duration_s`, etc.|
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/copy_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
Expand All @@ -36,7 +37,6 @@

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

Expand Down Expand Up @@ -256,7 +256,7 @@ struct scatter_gather_functor {

cudf::detail::grid_1d grid{input.size(), block_size, per_thread};

rmm::device_scalar<cudf::size_type> null_count{0, stream};
cudf::detail::device_scalar<cudf::size_type> null_count{0, stream};
if (output.nullable()) {
// Have to initialize the output mask to all zeros because we may update
// it with atomicOr().
Expand Down
5 changes: 2 additions & 3 deletions cpp/include/cudf/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/device_scalar.hpp>

#include <cuda/std/optional>
#include <thrust/iterator/iterator_traits.h>

Expand Down Expand Up @@ -171,7 +170,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,

// if we have validity in the output
if (nullable) {
rmm::device_scalar<size_type> valid_count{0, stream};
cudf::detail::device_scalar<size_type> valid_count{0, stream};

// call the kernel
copy_if_else_kernel<block_size, Element, LeftIter, RightIter, FilterFn, true>
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/copy_range.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
Expand All @@ -27,7 +28,6 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

#include <cub/cub.cuh>
#include <cuda_runtime.h>
Expand Down Expand Up @@ -154,7 +154,7 @@ void copy_range(SourceValueIterator source_value_begin,
auto grid = cudf::detail::grid_1d{num_items, block_size, 1};

if (target.nullable()) {
rmm::device_scalar<size_type> null_count(target.null_count(), stream);
cudf::detail::device_scalar<size_type> null_count(target.null_count(), stream);

auto kernel =
copy_range_kernel<block_size, SourceValueIterator, SourceValidityIterator, T, true>;
Expand Down
103 changes: 103 additions & 0 deletions cpp/include/cudf/detail/device_scalar.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/host_vector.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/resource_ref.hpp>

namespace CUDF_EXPORT cudf {
namespace detail {

template <typename T>
class device_scalar : public rmm::device_scalar<T> {
public:
#ifdef __CUDACC__
#pragma nv_exec_check_disable
#endif
~device_scalar() = default;

// Implementation is the same as what compiler should generate
// Could not use default move constructor as 11.8 compiler fails to generate it
#ifdef __CUDACC__
#pragma nv_exec_check_disable
#endif
device_scalar(device_scalar&& other) noexcept
: rmm::device_scalar<T>{std::move(other)}, bounce_buffer{std::move(other.bounce_buffer)}
{
}
device_scalar& operator=(device_scalar&&) noexcept = default;

device_scalar(device_scalar const&) = delete;
device_scalar& operator=(device_scalar const&) = delete;

device_scalar() = delete;

explicit device_scalar(
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
: rmm::device_scalar<T>(stream, mr), bounce_buffer{make_host_vector<T>(1, stream)}
{
}

explicit device_scalar(
T const& initial_value,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
: rmm::device_scalar<T>(stream, mr), bounce_buffer{make_host_vector<T>(1, stream)}
{
bounce_buffer[0] = initial_value;
cuda_memcpy_async<T>(device_span<T>{this->data(), 1}, bounce_buffer, stream);
}

device_scalar(device_scalar const& other,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
: rmm::device_scalar<T>(other, stream, mr), bounce_buffer{make_host_vector<T>(1, stream)}
{
}

[[nodiscard]] T value(rmm::cuda_stream_view stream) const
{
cuda_memcpy<T>(bounce_buffer, device_span<T const>{this->data(), 1}, stream);
return bounce_buffer[0];
}

void set_value_async(T const& value, rmm::cuda_stream_view stream)
{
bounce_buffer[0] = value;
cuda_memcpy_async<T>(device_span<T>{this->data(), 1}, bounce_buffer, stream);
}

void set_value_async(T&& value, rmm::cuda_stream_view stream)
{
bounce_buffer[0] = std::move(value);
cuda_memcpy_async<T>(device_span<T>{this->data(), 1}, bounce_buffer, stream);
}

void set_value_to_zero_async(rmm::cuda_stream_view stream) { set_value_async(T{}, stream); }

private:
mutable cudf::detail::host_vector<T> bounce_buffer;
};

} // namespace detail
} // namespace CUDF_EXPORT cudf
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/null_mask.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/detail/valid_if.cuh>
Expand All @@ -25,7 +26,6 @@
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/exec_policy.hpp>

#include <cub/block/block_reduce.cuh>
Expand Down Expand Up @@ -165,7 +165,7 @@ size_type inplace_bitmask_binop(Binop op,
"Mask pointer cannot be null");

rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref();
rmm::device_scalar<size_type> d_counter{0, stream, mr};
cudf::detail::device_scalar<size_type> d_counter{0, stream, mr};
rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/valid_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>
Expand All @@ -25,7 +26,6 @@
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

#include <thrust/distance.h>

Expand Down Expand Up @@ -101,7 +101,7 @@ std::pair<rmm::device_buffer, size_type> valid_if(InputIterator begin,

size_type null_count{0};
if (size > 0) {
rmm::device_scalar<size_type> valid_count{0, stream};
cudf::detail::device_scalar<size_type> valid_count{0, stream};

constexpr size_type block_size{256};
grid_1d grid{size, block_size};
Expand Down
5 changes: 3 additions & 2 deletions cpp/include/cudf/scalar/scalar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/table/table.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -94,8 +95,8 @@ class scalar {
[[nodiscard]] bool const* validity_data() const;

protected:
data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar
rmm::device_scalar<bool> _is_valid; ///< Device bool signifying validity
data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar
cudf::detail::device_scalar<bool> _is_valid; ///< Device bool signifying validity

/**
* @brief Move constructor for scalar.
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/bitmask/null_mask.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/null_mask.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
Expand All @@ -32,7 +33,6 @@

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cub/cub.cuh>
Expand Down Expand Up @@ -329,7 +329,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask,

cudf::detail::grid_1d grid(num_words, block_size);

rmm::device_scalar<size_type> non_zero_count(0, stream);
cudf::detail::device_scalar<size_type> non_zero_count(0, stream);

count_set_bits_kernel<block_size>
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/concatenate.hpp>
#include <cudf/detail/concatenate_masks.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
Expand Down Expand Up @@ -162,7 +163,7 @@ size_type concatenate_masks(device_span<column_device_view const> d_views,
size_type output_size,
rmm::cuda_stream_view stream)
{
rmm::device_scalar<size_type> d_valid_count(0, stream);
cudf::detail::device_scalar<size_type> d_valid_count(0, stream);
constexpr size_type block_size{256};
cudf::detail::grid_1d config(output_size, block_size);
concatenate_masks_kernel<block_size>
Expand Down Expand Up @@ -265,7 +266,7 @@ std::unique_ptr<column> fused_concatenate(host_span<column_view const> views,
auto out_view = out_col->mutable_view();
auto d_out_view = mutable_column_device_view::create(out_view, stream);

rmm::device_scalar<size_type> d_valid_count(0, stream);
cudf::detail::device_scalar<size_type> d_valid_count(0, stream);

// Launch kernel
constexpr size_type block_size{256};
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/copying/get_element.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/copying.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/is_element_valid.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
Expand Down Expand Up @@ -71,7 +72,7 @@ struct get_element_functor {
auto device_col = column_device_view::create(input, stream);

rmm::device_scalar<string_view> temp_data(stream, mr);
rmm::device_scalar<bool> temp_valid(stream, mr);
cudf::detail::device_scalar<bool> temp_valid(stream, mr);

device_single_thread(
[buffer = temp_data.data(),
Expand Down Expand Up @@ -155,8 +156,8 @@ struct get_element_functor {

auto device_col = column_device_view::create(input, stream);

rmm::device_scalar<Type> temp_data(stream, mr);
rmm::device_scalar<bool> temp_valid(stream, mr);
cudf::detail::device_scalar<Type> temp_data(stream, mr);
cudf::detail::device_scalar<bool> temp_valid(stream, mr);

device_single_thread(
[buffer = temp_data.data(),
Expand Down
Loading

0 comments on commit 6b9881b

Please sign in to comment.