Some additional kernel thread index refactoring. #14107

Merged

Changes from 3 commits
17 changes: 10 additions & 7 deletions cpp/benchmarks/join/generate_input_tables.cuh
@@ -16,6 +16,7 @@

#pragma once

#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
@@ -34,7 +35,7 @@

CUDF_KERNEL void init_curand(curandState* state, int const nstates)
{
int ithread = threadIdx.x + blockIdx.x * blockDim.x;
int ithread = cudf::detail::grid_1d::global_thread_id();

if (ithread < nstates) { curand_init(1234ULL, ithread, 0, state + ithread); }
}
@@ -46,13 +47,14 @@ CUDF_KERNEL void init_build_tbl(key_type* const build_tbl,
curandState* state,
int const num_states)
{
auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const stride = blockDim.x * gridDim.x;
auto const start_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
assert(start_idx < num_states);

curandState localState = state[start_idx];

for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) {
for (thread_index_type tidx = start_idx; tidx < build_tbl_size; tidx += stride) {
Contributor:
Should we add a range-based for loop to replace this pattern?

for(auto const tidx: grid_1d::thread_indices()) {...}

So we won't need to have a stride like this.

@bdice (Contributor, Author) replied on May 2, 2024:
For now I'm going to leave that out of scope for this PR. I want to tighten the scope here and merge this since it's fairly old. Please feel free to comment on #10368 -- the topic of a range-based loop has already come up there.
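For illustration only, a minimal sketch of what such a range-based helper could look like. Neither thread_indices() nor the thread_index_range type below exists in libcudf; this is just a sketch of the reviewer's suggestion, built on the grid_1d::global_thread_id()/grid_stride() helpers this PR already uses.

// Hypothetical sketch (not part of cudf): a device-side range over global
// thread indices, so kernels could write a range-based for loop instead of
// an explicit start_idx/stride loop.
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>

class thread_index_range {
 public:
  class iterator {
   public:
    __device__ iterator(cudf::thread_index_type idx, cudf::thread_index_type stride)
      : idx_{idx}, stride_{stride}
    {
    }
    __device__ cudf::thread_index_type operator*() const { return idx_; }
    __device__ iterator& operator++()
    {
      idx_ += stride_;
      return *this;
    }
    // Asymmetric comparison so a stride that overshoots the end still terminates.
    __device__ bool operator!=(iterator const& end) const { return idx_ < end.idx_; }

   private:
    cudf::thread_index_type idx_;
    cudf::thread_index_type stride_;
  };

  __device__ explicit thread_index_range(cudf::thread_index_type size) : size_{size} {}
  __device__ iterator begin() const
  {
    return iterator{cudf::detail::grid_1d::global_thread_id(),
                    cudf::detail::grid_1d::grid_stride()};
  }
  __device__ iterator end() const { return iterator{size_, 0}; }

 private:
  cudf::thread_index_type size_;
};

// Usage inside a kernel, replacing the explicit loop above:
//   for (auto const tidx : thread_index_range{build_tbl_size}) {
//     auto const idx = static_cast<cudf::size_type>(tidx);
//     ...
//   }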

auto const idx = static_cast<size_type>(tidx);
double const x = curand_uniform_double(&localState);

build_tbl[idx] = static_cast<key_type>(x * (build_tbl_size / multiplicity));
@@ -71,13 +73,14 @@ CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl,
curandState* state,
int const num_states)
{
auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const stride = blockDim.x * gridDim.x;
auto const start_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
assert(start_idx < num_states);

curandState localState = state[start_idx];

for (size_type idx = start_idx; idx < probe_tbl_size; idx += stride) {
for (thread_index_type tidx = start_idx; tidx < probe_tbl_size; tidx += stride) {
auto const idx = static_cast<size_type>(tidx);
key_type val;
double x = curand_uniform_double(&localState);

32 changes: 18 additions & 14 deletions cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -60,26 +60,30 @@ constexpr int block_size = 256;
template <FunctorType functor_type, class T>
CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
{
using F = Functor<T, functor_type>;
cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
while (index < n_rows) {
using F = Functor<T, functor_type>;
auto tidx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
while (tidx < n_rows) {
auto const index = static_cast<cudf::size_type>(tidx);
for (int c = 0; c < n_cols; c++) {
A[c][index] = F::f(A[c][index]);
}
index += blockDim.x * gridDim.x;
tidx += stride;
}
}

// This is for HOST_DISPATCHING
template <FunctorType functor_type, class T>
CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
{
using F = Functor<T, functor_type>;
T* A = source_column.data<T>();
cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
while (index < source_column.size()) {
A[index] = F::f(A[index]);
index += blockDim.x * gridDim.x;
using F = Functor<T, functor_type>;
T* A = source_column.data<T>();
auto tidx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
while (tidx < source_column.size()) {
auto const index = static_cast<cudf::size_type>(tidx);
A[index] = F::f(A[index]);
tidx += stride;
}
}

@@ -127,14 +131,14 @@ template <FunctorType functor_type>
CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source)
{
cudf::size_type const n_rows = source.num_rows();
cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x;

while (index < n_rows) {
auto tidx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
while (tidx < n_rows) {
for (cudf::size_type i = 0; i < source.num_columns(); i++) {
cudf::type_dispatcher(
source.column(i).type(), RowHandle<functor_type>{}, source.column(i), index);
}
index += blockDim.x * gridDim.x;
tidx += stride;
} // while
}

23 changes: 12 additions & 11 deletions cpp/include/cudf/detail/copy_if_else.cuh
@@ -45,29 +45,30 @@ __launch_bounds__(block_size) CUDF_KERNEL
mutable_column_device_view out,
size_type* __restrict__ const valid_count)
{
size_type const tid = threadIdx.x + blockIdx.x * block_size;
int const warp_id = tid / warp_size;
size_type const warps_per_grid = gridDim.x * block_size / warp_size;
auto tidx = cudf::detail::grid_1d::global_thread_id<block_size>();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
int const warp_id = tidx / cudf::detail::warp_size;
size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size;

// begin/end indices for the column data
size_type begin = 0;
size_type end = out.size();
size_type const begin = 0;
size_type const end = out.size();
// warp indices. since 1 warp == 32 threads == sizeof(bitmask_type) * 8,
// each warp will process one (32 bit) of the validity mask via
// __ballot_sync()
size_type warp_begin = cudf::word_index(begin);
size_type warp_end = cudf::word_index(end - 1);
size_type const warp_begin = cudf::word_index(begin);
size_type const warp_end = cudf::word_index(end - 1);

// lane id within the current warp
constexpr size_type leader_lane{0};
int const lane_id = threadIdx.x % warp_size;
int const lane_id = threadIdx.x % cudf::detail::warp_size;

size_type warp_valid_count{0};

// current warp.
size_type warp_cur = warp_begin + warp_id;
size_type index = tid;
while (warp_cur <= warp_end) {
auto const index = static_cast<size_type>(tidx);
auto const opt_value =
(index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt;
if (opt_value) { out.element<T>(index) = static_cast<T>(*opt_value); }
Expand All @@ -85,7 +86,7 @@ __launch_bounds__(block_size) CUDF_KERNEL

// next grid
warp_cur += warps_per_grid;
index += block_size * gridDim.x;
tidx += stride;
}

if (has_nulls) {
@@ -159,7 +160,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,
using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;

size_type size = std::distance(lhs_begin, lhs_end);
size_type num_els = cudf::util::round_up_safe(size, warp_size);
size_type num_els = cudf::util::round_up_safe(size, cudf::detail::warp_size);
constexpr int block_size = 256;
cudf::detail::grid_1d grid{num_els, block_size, 1};

26 changes: 26 additions & 0 deletions cpp/include/cudf/detail/utilities/cuda.cuh
@@ -93,6 +93,19 @@ class grid_1d {
return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
}

/**
* @brief Returns the global thread index of the current thread in a 1D grid.
*
* @tparam num_threads_per_block The number of threads per block
*
* @return thread_index_type The global thread index
*/
template <thread_index_type num_threads_per_block>
static __device__ thread_index_type global_thread_id()
{
return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block);
}

/**
* @brief Returns the stride of a 1D grid.
*
@@ -115,6 +128,19 @@
* @return thread_index_type The number of threads in the grid.
*/
static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }

/**
* @brief Returns the stride of the current 1D grid.
*
* @tparam num_threads_per_block The number of threads per block
*
* @return thread_index_type The number of threads in the grid.
*/
template <thread_index_type num_threads_per_block>
static __device__ thread_index_type grid_stride()
{
return grid_stride(num_threads_per_block, gridDim.x);
}
};

/**
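For context, a brief usage sketch of the templated overloads added above. The kernel name, block size constant, and data arguments are illustrative only; the overloads themselves (global_thread_id<num_threads_per_block>() and grid_stride<num_threads_per_block>()) are the additions shown in this diff, and they assume the kernel really is launched with that many threads per block.

// Illustrative only: a grid-stride loop that passes the block size as a
// compile-time constant instead of reading blockDim.x at run time.
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>

constexpr int example_block_size = 256;  // assumed launch configuration

__global__ void example_kernel(int* data, cudf::size_type size)
{
  auto tidx         = cudf::detail::grid_1d::global_thread_id<example_block_size>();
  auto const stride = cudf::detail::grid_1d::grid_stride<example_block_size>();
  while (tidx < size) {
    auto const idx = static_cast<cudf::size_type>(tidx);
    data[idx] += 1;
    tidx += stride;
  }
}
// Must be launched with example_block_size threads per block, e.g. with a grid
// sized via cudf::detail::grid_1d{size, example_block_size}.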
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -50,8 +50,8 @@ CUDF_KERNEL void valid_if_kernel(
{
constexpr size_type leader_lane{0};
auto const lane_id{threadIdx.x % warp_size};
auto i = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
auto i = cudf::detail::grid_1d::global_thread_id<block_size>();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
size_type warp_valid_count{0};

auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size);
4 changes: 2 additions & 2 deletions cpp/src/bitmask/null_mask.cu
@@ -269,8 +269,8 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask,

auto const first_word_index{word_index(first_bit_index)};
auto const last_word_index{word_index(last_bit_index)};
thread_index_type const tid = grid_1d::global_thread_id();
thread_index_type const stride = grid_1d::grid_stride();
thread_index_type const tid = grid_1d::global_thread_id<block_size>();
thread_index_type const stride = grid_1d::grid_stride<block_size>();
thread_index_type thread_word_index = tid + first_word_index;
size_type thread_count{0};

4 changes: 2 additions & 2 deletions cpp/src/copying/concatenate.cu
@@ -121,8 +121,8 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views,
size_type number_of_mask_bits,
size_type* out_valid_count)
{
auto tidx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
auto tidx = cudf::detail::grid_1d::global_thread_id<block_size>();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
auto active_mask = __ballot_sync(0xFFFF'FFFFu, tidx < number_of_mask_bits);

size_type warp_valid_count = 0;
10 changes: 5 additions & 5 deletions cpp/src/join/conditional_join_kernels.cuh
@@ -67,8 +67,8 @@ CUDF_KERNEL void compute_conditional_join_output_size(
&intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];

std::size_t thread_counter{0};
auto const start_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();

cudf::thread_index_type const left_num_rows = left_table.num_rows();
cudf::thread_index_type const right_num_rows = right_table.num_rows();
@@ -174,7 +174,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,

__syncwarp();

auto outer_row_index = cudf::detail::grid_1d::global_thread_id();
auto outer_row_index = cudf::detail::grid_1d::global_thread_id<block_size>();

unsigned int const activemask = __ballot_sync(0xffff'ffffu, outer_row_index < outer_num_rows);

@@ -295,8 +295,8 @@ CUDF_KERNEL void conditional_join_anti_semi(
int const lane_id = threadIdx.x % detail::warp_size;
cudf::thread_index_type const outer_num_rows = left_table.num_rows();
cudf::thread_index_type const inner_num_rows = right_table.num_rows();
auto const stride = cudf::detail::grid_1d::grid_stride();
auto const start_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();

if (0 == lane_id) { current_idx_shared[warp_id] = 0; }

18 changes: 10 additions & 8 deletions cpp/src/strings/convert/convert_urls.cu
@@ -202,10 +202,11 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
__shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size];
__shared__ typename cub::WarpReduce<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];

auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const global_thread_id =
cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
char* in_chars_shared = temporary_buffer[local_warp_id];

@@ -287,10 +288,11 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
__shared__ typename cub::WarpScan<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
__shared__ size_type out_idx[num_warps_per_threadblock];

auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const global_thread_id =
cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
char* in_chars_shared = temporary_buffer[local_warp_id];
