Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add device subsets example #346

Merged
merged 34 commits into from
Sep 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
f6f37fa
Add device subsets example
PointKernel Jul 3, 2023
d35db89
Merge remote-tracking branch 'upstream/dev' into subset-example
PointKernel Aug 5, 2023
482a14e
Cleanups: fix typo, remove printf etc
PointKernel Aug 5, 2023
8e559e0
Remove unrelated code
PointKernel Aug 5, 2023
40b9b15
Merge remote-tracking branch 'upstream/dev' into subset-example
PointKernel Aug 8, 2023
fda8f88
Add default extent template parameter
PointKernel Aug 8, 2023
2c92f84
Add missing headers
PointKernel Aug 9, 2023
233c668
Add missing header + temporarily disable asserts
PointKernel Aug 9, 2023
4eb25c9
Update subset example
PointKernel Aug 9, 2023
b68761d
Merge remote-tracking branch 'upstream/dev' into subset-example
PointKernel Aug 10, 2023
56b5dc3
Update example
PointKernel Aug 10, 2023
eff6faa
Resolve merging conflict
PointKernel Aug 10, 2023
871424a
Add default parameters to aow_storage for convenience
PointKernel Aug 10, 2023
635988b
Add storage initialized_async
PointKernel Aug 15, 2023
393ee3b
Update subset example
PointKernel Aug 15, 2023
70c3df7
Renaming
PointKernel Aug 15, 2023
085d1bb
Minor cleanups
PointKernel Aug 16, 2023
755db26
Add docs and comments
PointKernel Aug 17, 2023
8c746d7
Merge remote-tracking branch 'upstream/dev' into subset-example
PointKernel Sep 1, 2023
cce72b4
Merge remote-tracking branch 'upstream/dev' into subset-example
PointKernel Sep 6, 2023
b8028f4
Remove CGSize from window_extent
PointKernel Sep 6, 2023
d913720
Add more headers
PointKernel Sep 6, 2023
02eabf6
Temporarily disable window extent checks in open addressing ref base …
PointKernel Sep 11, 2023
3d016f6
Remove window_size tparam from window_extent
sleeepyjack Sep 12, 2023
5d88ea7
Add operator-agnostic static_set_ref move ctor and with() helper func…
sleeepyjack Sep 13, 2023
2433c09
Update device subset example
sleeepyjack Sep 13, 2023
c46d6fe
Partially re-enable checks
sleeepyjack Sep 13, 2023
adc6a54
Merge remote-tracking branch 'upstream/dev' into subset-example
PointKernel Sep 22, 2023
770a2ad
Merge remote-tracking branch 'origin/subset-example' into subset-example
PointKernel Sep 25, 2023
c632a9f
Remove window_extent static check and use size_t in the example
PointKernel Sep 26, 2023
2fa3810
With function to static_map_ref
PointKernel Sep 26, 2023
9e29ea7
Add TODO reminder
PointKernel Sep 26, 2023
8c8c3c1
Clean up example code
PointKernel Sep 26, 2023
2e7658c
Move the sentinel
PointKernel Sep 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ endfunction(ConfigureExample)

ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu")
ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
Expand Down
16 changes: 8 additions & 8 deletions examples/static_set/device_ref_example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@
#include <cstddef>
#include <iostream>

/**
* @file device_reference_example.cu
* @brief Demonstrates usage of the static_set device-side APIs.
*
* static_set provides a non-owning reference which can be used to interact with
* the container from within device code.
*/

// insert a set of keys into a hash set using one cooperative group for each task
template <typename SetRef, typename InputIterator>
__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n)
Expand Down Expand Up @@ -60,14 +68,6 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O
}
}

/**
* @file device_reference_example.cu
* @brief Demonstrates usage of the static_set device-side APIs.
*
* static_set provides a non-owning reference which can be used to interact with
* the container from within device code.
*
*/
int main(void)
{
using Key = int;
Expand Down
183 changes: 183 additions & 0 deletions examples/static_set/device_subsets_example.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cuco/static_set_ref.cuh>
#include <cuco/storage.cuh>

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

#include <cooperative_groups.h>

#include <cuda/std/array>

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>

/**
* @file device_subsets_example.cu
* @brief Demonstrates how to use one bulk set storage to create multiple subsets and perform
* individual operations via device-side ref APIs.
*
* To optimize memory usage, especially when dealing with expensive data allocation and multiple
* hashsets, a practical solution involves employing a single bulk storage for generating subsets.
* This eliminates the need for separate memory allocation and deallocation for each container. This
* can be achieved by using the lightweight non-owning ref type.
*
* @note This example is for demonstration purposes only. It is not intended to show the most
* performant way to do the example algorithm.
*/

auto constexpr cg_size = 8; ///< A CUDA Cooperative Group of 8 threads to handle each subset
auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread
auto constexpr N = 10; ///< Number of elements to insert and query

using key_type = int; ///< Key type
using probing_scheme_type = cuco::experimental::linear_probing<
cg_size,
cuco::default_hash_function<key_type>>; ///< Type controls CG granularity and probing scheme
///< (linear probing v.s. double hashing)
/// Type of bulk allocation storage
using storage_type = cuco::experimental::aow_storage<key_type, window_size>;
/// Lightweight non-owning storage ref type
using storage_ref_type = typename storage_type::ref_type;
using ref_type = cuco::experimental::static_set_ref<key_type,
cuda::thread_scope_device,
thrust::equal_to<key_type>,
probing_scheme_type,
storage_ref_type>; ///< Set ref type

/// Sample data to insert and query
__device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
/// Empty slots are represented by reserved "sentinel" values. These values should be selected such
/// that they never occur in your input data.
key_type constexpr empty_key_sentinel = -1;

/**
* @brief Inserts sample data into subsets by using cooperative group
*
* Each Cooperative Group creates its own subset and inserts `N` sample data.
*
* @param set_refs Pointer to the array of subset objects
*/
__global__ void insert(ref_type* set_refs)
{
namespace cg = cooperative_groups;

auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
// Get subset (or CG) index
auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

auto raw_set_ref = *(set_refs + idx);
auto insert_set_ref = std::move(raw_set_ref).with(cuco::experimental::insert);

// Insert `N` elemtns into the set with CG insert
for (int i = 0; i < N; i++) {
insert_set_ref.insert(tile, data[i]);
}
}

/**
* @brief All inserted data can be found
*
* Each Cooperative Group reconstructs its own subset ref based on the storage parameters and
* verifies all inserted data can be found.
*
* @param set_refs Pointer to the array of subset objects
*/
__global__ void find(ref_type* set_refs)
{
namespace cg = cooperative_groups;

auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

auto raw_set_ref = *(set_refs + idx);
auto find_set_ref = std::move(raw_set_ref).with(cuco::experimental::find);

// Result denoting if any of the inserted data is not found
__shared__ int result;
if (threadIdx.x == 0) { result = 0; }
__syncthreads();

for (int i = 0; i < N; i++) {
// Query the set with inserted data
auto const found = find_set_ref.find(tile, data[i]);
// Record if the inserted data has been found
atomicOr(&result, *found != data[i]);
}
__syncthreads();

if (threadIdx.x == 0) {
// If the result is still 0, all inserted data are found.
if (result == 0) { printf("Success! Found all inserted elements.\n"); }
}
}

int main()
{
// Number of subsets to be created
auto constexpr num = 16;
// Each subset may have a different requested size
auto constexpr subset_sizes =
std::array<std::size_t, num>{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50};

auto valid_sizes = std::vector<std::size_t>();
valid_sizes.reserve(num);

for (size_t i = 0; i < num; ++i) {
valid_sizes.emplace_back(
static_cast<std::size_t>(cuco::experimental::make_window_extent<ref_type>(subset_sizes[i])));
}

std::vector<std::size_t> offsets(num + 1, 0);

// prefix sum to compute offsets and total number of windows
std::size_t current_sum = 0;
for (std::size_t i = 0; i < valid_sizes.size(); ++i) {
current_sum += valid_sizes[i];
offsets[i + 1] = current_sum;
}

// total number of windows is located at the back of the offsets array
auto const total_num_windows = offsets.back();

// Create a single bulk storage used by all subsets
auto set_storage = storage_type{total_num_windows};
// Initializes the storage with the given sentinel
set_storage.initialize(empty_key_sentinel);

std::vector<ref_type> set_refs;

// create subsets
for (std::size_t i = 0; i < num; ++i) {
storage_ref_type storage_ref{valid_sizes[i], set_storage.data() + offsets[i]};
set_refs.emplace_back(
ref_type{cuco::empty_key<key_type>{empty_key_sentinel}, {}, {}, storage_ref});
}

thrust::device_vector<ref_type> d_set_refs(set_refs);

// Insert sample data
insert<<<1, 128>>>(d_set_refs.data().get());
// Find all inserted data
find<<<1, 128>>>(d_set_refs.data().get());

return 0;
}
23 changes: 17 additions & 6 deletions include/cuco/aow_storage.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#pragma once

#include <cuco/detail/storage/aow_storage_base.cuh>

#include <cuco/cuda_stream_ref.hpp>
#include <cuco/detail/storage/aow_storage_base.cuh>
#include <cuco/extent.cuh>
#include <cuco/utility/allocator.hpp>

#include <cuda/std/array>

Expand Down Expand Up @@ -47,7 +47,10 @@ class aow_storage_ref;
* @tparam Extent Type of extent denoting number of windows
* @tparam Allocator Type of allocator used for device storage (de)allocation
*/
template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
template <typename T,
int32_t WindowSize,
typename Extent = cuco::experimental::extent<std::size_t>,
typename Allocator = cuco::cuda_allocator<cuco::experimental::window<T, WindowSize>>>
class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
public:
using base_type = detail::aow_storage_base<T, WindowSize, Extent>; ///< AoW base class type
Expand Down Expand Up @@ -78,7 +81,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @param size Number of windows to (de)allocate
* @param allocator Allocator used for (de)allocating device storage
*/
explicit constexpr aow_storage(Extent size, Allocator const& allocator) noexcept;
explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept;

aow_storage(aow_storage&&) = default; ///< Move constructor
/**
Expand Down Expand Up @@ -119,7 +122,15 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @param key Key to which all keys in `slots` are initialized
* @param stream Stream used for executing the kernel
*/
void initialize(value_type key, cuda_stream_ref stream) noexcept;
void initialize(value_type key, cuda_stream_ref stream = {}) noexcept;

/**
* @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
*
* @param key Key to which all keys in `slots` are initialized
* @param stream Stream used for executing the kernel
*/
void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept;

private:
allocator_type allocator_; ///< Allocator used to (de)allocate windows
Expand All @@ -134,7 +145,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @tparam WindowSize Number of slots in each window
* @tparam Extent Type of extent denoting storage capacity
*/
template <typename T, int32_t WindowSize, typename Extent>
template <typename T, int32_t WindowSize, typename Extent = cuco::experimental::extent<std::size_t>>
class aow_storage_ref : public detail::aow_storage_base<T, WindowSize, Extent> {
public:
using base_type = detail::aow_storage_base<T, WindowSize, Extent>; ///< AoW base class type
Expand Down
35 changes: 13 additions & 22 deletions include/cuco/detail/extent/extent.inl
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,10 @@
namespace cuco {
namespace experimental {

template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N = dynamic_extent>
template <typename SizeType, std::size_t N>
struct window_extent {
using value_type = SizeType; ///< Extent value type

static auto constexpr cg_size = CGSize;
static auto constexpr window_size = WindowSize;

__host__ __device__ constexpr value_type value() const noexcept { return N; }
__host__ __device__ explicit constexpr operator value_type() const noexcept { return value(); }

Expand All @@ -45,15 +42,11 @@ struct window_extent {
friend auto constexpr make_window_extent(extent<SizeType_, N_> ext);
};

template <int32_t CGSize, int32_t WindowSize, typename SizeType>
struct window_extent<CGSize, WindowSize, SizeType, dynamic_extent>
: cuco::utility::fast_int<SizeType> {
template <typename SizeType>
struct window_extent<SizeType, dynamic_extent> : cuco::utility::fast_int<SizeType> {
using value_type =
typename cuco::utility::fast_int<SizeType>::fast_int::value_type; ///< Extent value type

static auto constexpr cg_size = CGSize;
static auto constexpr window_size = WindowSize;

private:
using cuco::utility::fast_int<SizeType>::fast_int;

Expand All @@ -67,10 +60,10 @@ template <typename Container, typename SizeType, std::size_t N>
return make_window_extent<Container::cg_size, Container::window_size>(ext);
}

template <typename Container>
[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size)
template <typename Container, typename SizeType>
[[nodiscard]] auto constexpr make_window_extent(SizeType size)
{
return make_window_extent<Container::cg_size, Container::window_size>(size);
return make_window_extent<Container::cg_size, Container::window_size>(extent<SizeType>{size});
}

template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
Expand All @@ -86,15 +79,13 @@ template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
if (size > max_value) { CUCO_FAIL("Invalid input extent"); }

if constexpr (N == dynamic_extent) {
return window_extent<CGSize, WindowSize, SizeType>{static_cast<SizeType>(
return window_extent<SizeType>{static_cast<SizeType>(
*cuco::detail::lower_bound(
cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast<uint64_t>(size)) *
CGSize)};
}
if constexpr (N != dynamic_extent) {
return window_extent<CGSize,
WindowSize,
SizeType,
return window_extent<SizeType,
static_cast<std::size_t>(
*cuco::detail::lower_bound(cuco::detail::primes.begin(),
cuco::detail::primes.end(),
Expand All @@ -103,10 +94,10 @@ template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
}
}

template <int32_t CGSize, int32_t WindowSize>
[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size)
template <int32_t CGSize, int32_t WindowSize, typename SizeType>
[[nodiscard]] auto constexpr make_window_extent(SizeType size)
{
return static_cast<std::size_t>(make_window_extent<CGSize, WindowSize>(extent{size}));
return make_window_extent<CGSize, WindowSize>(extent<SizeType>{size});
}

namespace detail {
Expand All @@ -115,8 +106,8 @@ template <typename...>
struct is_window_extent : std::false_type {
};

template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
struct is_window_extent<window_extent<CGSize, WindowSize, SizeType, N>> : std::true_type {
template <typename SizeType, std::size_t N>
struct is_window_extent<window_extent<SizeType, N>> : std::true_type {
};

template <typename T>
Expand Down
8 changes: 2 additions & 6 deletions include/cuco/detail/open_addressing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,7 @@ class open_addressing_impl {
*
* @param stream CUDA stream this operation is executed in
*/
void clear(cuda_stream_ref stream) noexcept
{
this->clear_async(stream);
stream.synchronize();
}
void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); }

/**
* @brief Asynchronously erases all elements from the container. After this call, `size()` returns
Expand All @@ -155,7 +151,7 @@ class open_addressing_impl {
*/
void clear_async(cuda_stream_ref stream) noexcept
{
storage_.initialize(empty_slot_sentinel_, stream);
storage_.initialize_async(empty_slot_sentinel_, stream);
}

/**
Expand Down
Loading
Loading