Skip to content

Commit

Permalink
Some additional kernel thread index refactoring. (#14107)
Browse files Browse the repository at this point in the history
This PR refactors a few kernels to use `thread_index_type` and associated utilities. I started this before realizing how much scope was still left in issue #10368 ("Part 2 - Take another pass over more challenging kernels"), and then I stopped working on this due to time constraints. For the moment, I hope this PR makes a small dent in the number of remaining kernels to convert to using `thread_index_type`.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Mark Harris (https://github.com/harrism)
  - David Wendt (https://github.com/davidwendt)

URL: #14107
  • Loading branch information
bdice authored May 7, 2024
1 parent e87a78d commit a958274
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 51 deletions.
17 changes: 10 additions & 7 deletions cpp/benchmarks/join/generate_input_tables.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
Expand All @@ -34,7 +35,7 @@

/**
 * @brief Initializes one curand state per thread.
 *
 * Expected launch: 1D grid with at least `nstates` total threads; threads
 * with an index >= `nstates` do nothing.
 *
 * @param state Device array of `nstates` curand states to initialize
 * @param nstates Number of states to initialize
 */
CUDF_KERNEL void init_curand(curandState* state, int const nstates)
{
  // Keep the wide thread_index_type returned by global_thread_id() instead of
  // narrowing to int, so the bounds check is safe for very large grids.
  auto const ithread = cudf::detail::grid_1d::global_thread_id();

  if (ithread < nstates) { curand_init(1234ULL, ithread, 0, state + ithread); }
}
Expand All @@ -46,13 +47,14 @@ CUDF_KERNEL void init_build_tbl(key_type* const build_tbl,
curandState* state,
int const num_states)
{
auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const stride = blockDim.x * gridDim.x;
auto const start_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
assert(start_idx < num_states);

curandState localState = state[start_idx];

for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) {
for (cudf::thread_index_type tidx = start_idx; tidx < build_tbl_size; tidx += stride) {
auto const idx = static_cast<size_type>(tidx);
double const x = curand_uniform_double(&localState);

build_tbl[idx] = static_cast<key_type>(x * (build_tbl_size / multiplicity));
Expand All @@ -71,13 +73,14 @@ CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl,
curandState* state,
int const num_states)
{
auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const stride = blockDim.x * gridDim.x;
auto const start_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
assert(start_idx < num_states);

curandState localState = state[start_idx];

for (size_type idx = start_idx; idx < probe_tbl_size; idx += stride) {
for (cudf::thread_index_type tidx = start_idx; tidx < probe_tbl_size; tidx += stride) {
auto const idx = static_cast<size_type>(tidx);
key_type val;
double x = curand_uniform_double(&localState);

Expand Down
33 changes: 19 additions & 14 deletions cpp/benchmarks/type_dispatcher/type_dispatcher.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,26 +60,30 @@ constexpr int block_size = 256;
// This is for NO_DISPATCHING: the element type T is fixed at compile time,
// so no runtime type dispatch happens inside the kernel.
template <FunctorType functor_type, class T>
CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
{
  using F = Functor<T, functor_type>;
  // Grid-stride loop over rows, using the wide thread_index_type utilities to
  // avoid overflow in the index arithmetic.
  auto tidx         = cudf::detail::grid_1d::global_thread_id();
  auto const stride = cudf::detail::grid_1d::grid_stride();
  while (tidx < n_rows) {
    auto const index = static_cast<cudf::size_type>(tidx);
    // Apply the functor to the current row of every column.
    for (int c = 0; c < n_cols; c++) {
      A[c][index] = F::f(A[c][index]);
    }
    tidx += stride;
  }
}

// This is for HOST_DISPATCHING: the type was dispatched on the host, so the
// kernel itself is monomorphic in T.
template <FunctorType functor_type, class T>
CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
{
  using F = Functor<T, functor_type>;
  T* A    = source_column.data<T>();
  // Grid-stride loop over the column's rows, using the wide thread_index_type
  // utilities to avoid overflow in the index arithmetic.
  auto tidx         = cudf::detail::grid_1d::global_thread_id();
  auto const stride = cudf::detail::grid_1d::grid_stride();
  while (tidx < source_column.size()) {
    auto const index = static_cast<cudf::size_type>(tidx);
    A[index]         = F::f(A[index]);
    tidx += stride;
  }
}

// This is for DEVICE_DISPATCHING: the element type is dispatched per column,
// per row, on the device via cudf::type_dispatcher.
template <FunctorType functor_type>
CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source)
{
  cudf::size_type const n_rows = source.num_rows();
  // Grid-stride loop over rows, using the wide thread_index_type utilities to
  // avoid overflow in the index arithmetic.
  auto tidx         = cudf::detail::grid_1d::global_thread_id();
  auto const stride = cudf::detail::grid_1d::grid_stride();
  while (tidx < n_rows) {
    auto const index = static_cast<cudf::size_type>(tidx);
    for (cudf::size_type i = 0; i < source.num_columns(); i++) {
      cudf::type_dispatcher(
        source.column(i).type(), RowHandle<functor_type>{}, source.column(i), index);
    }
    tidx += stride;
  }  // while
}

Expand Down
23 changes: 12 additions & 11 deletions cpp/include/cudf/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,29 +45,30 @@ __launch_bounds__(block_size) CUDF_KERNEL
mutable_column_device_view out,
size_type* __restrict__ const valid_count)
{
size_type const tid = threadIdx.x + blockIdx.x * block_size;
int const warp_id = tid / warp_size;
size_type const warps_per_grid = gridDim.x * block_size / warp_size;
auto tidx = cudf::detail::grid_1d::global_thread_id<block_size>();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
int const warp_id = tidx / cudf::detail::warp_size;
size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size;

// begin/end indices for the column data
size_type begin = 0;
size_type end = out.size();
size_type const begin = 0;
size_type const end = out.size();
// warp indices. since 1 warp == 32 threads == sizeof(bitmask_type) * 8,
// each warp will process one (32 bit) of the validity mask via
// __ballot_sync()
size_type warp_begin = cudf::word_index(begin);
size_type warp_end = cudf::word_index(end - 1);
size_type const warp_begin = cudf::word_index(begin);
size_type const warp_end = cudf::word_index(end - 1);

// lane id within the current warp
constexpr size_type leader_lane{0};
int const lane_id = threadIdx.x % warp_size;
int const lane_id = threadIdx.x % cudf::detail::warp_size;

size_type warp_valid_count{0};

// current warp.
size_type warp_cur = warp_begin + warp_id;
size_type index = tid;
while (warp_cur <= warp_end) {
auto const index = static_cast<size_type>(tidx);
auto const opt_value =
(index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt;
if (opt_value) { out.element<T>(index) = static_cast<T>(*opt_value); }
Expand All @@ -85,7 +86,7 @@ __launch_bounds__(block_size) CUDF_KERNEL

// next grid
warp_cur += warps_per_grid;
index += block_size * gridDim.x;
tidx += stride;
}

if (has_nulls) {
Expand Down Expand Up @@ -159,7 +160,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,
using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;

size_type size = std::distance(lhs_begin, lhs_end);
size_type num_els = cudf::util::round_up_safe(size, warp_size);
size_type num_els = cudf::util::round_up_safe(size, cudf::detail::warp_size);
constexpr int block_size = 256;
cudf::detail::grid_1d grid{num_els, block_size, 1};

Expand Down
26 changes: 26 additions & 0 deletions cpp/include/cudf/detail/utilities/cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,19 @@ class grid_1d {
return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
}

/**
 * @brief Returns the global thread index of the current thread in a 1D grid,
 * with the number of threads per block supplied at compile time.
 *
 * @tparam num_threads_per_block The number of threads per block
 *
 * @return thread_index_type The global thread index
 */
template <thread_index_type num_threads_per_block>
static __device__ thread_index_type global_thread_id()
{
  // Delegate to the runtime overload, substituting the compile-time block size.
  auto const thread_index = global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block);
  return thread_index;
}

/**
* @brief Returns the stride of a 1D grid.
*
Expand All @@ -115,6 +128,19 @@ class grid_1d {
* @return thread_index_type The number of threads in the grid.
*/
static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }

/**
 * @brief Returns the stride of the current 1D grid, with the number of
 * threads per block supplied at compile time.
 *
 * @tparam num_threads_per_block The number of threads per block
 *
 * @return thread_index_type The number of threads in the grid.
 */
template <thread_index_type num_threads_per_block>
static __device__ thread_index_type grid_stride()
{
  // Delegate to the runtime overload, substituting the compile-time block size.
  auto const stride = grid_stride(num_threads_per_block, gridDim.x);
  return stride;
}
};

/**
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/valid_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ CUDF_KERNEL void valid_if_kernel(
{
constexpr size_type leader_lane{0};
auto const lane_id{threadIdx.x % warp_size};
auto i = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
auto i = cudf::detail::grid_1d::global_thread_id<block_size>();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
size_type warp_valid_count{0};

auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size);
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/bitmask/null_mask.cu
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,8 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask,

auto const first_word_index{word_index(first_bit_index)};
auto const last_word_index{word_index(last_bit_index)};
thread_index_type const tid = grid_1d::global_thread_id();
thread_index_type const stride = grid_1d::grid_stride();
thread_index_type const tid = grid_1d::global_thread_id<block_size>();
thread_index_type const stride = grid_1d::grid_stride<block_size>();
thread_index_type thread_word_index = tid + first_word_index;
size_type thread_count{0};

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views,
size_type number_of_mask_bits,
size_type* out_valid_count)
{
auto tidx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
auto tidx = cudf::detail::grid_1d::global_thread_id<block_size>();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
auto active_mask = __ballot_sync(0xFFFF'FFFFu, tidx < number_of_mask_bits);

size_type warp_valid_count = 0;
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/join/conditional_join_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ CUDF_KERNEL void compute_conditional_join_output_size(
&intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];

std::size_t thread_counter{0};
auto const start_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride();
auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();

cudf::thread_index_type const left_num_rows = left_table.num_rows();
cudf::thread_index_type const right_num_rows = right_table.num_rows();
Expand Down Expand Up @@ -174,7 +174,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,

__syncwarp();

auto outer_row_index = cudf::detail::grid_1d::global_thread_id();
auto outer_row_index = cudf::detail::grid_1d::global_thread_id<block_size>();

unsigned int const activemask = __ballot_sync(0xffff'ffffu, outer_row_index < outer_num_rows);

Expand Down Expand Up @@ -295,8 +295,8 @@ CUDF_KERNEL void conditional_join_anti_semi(
int const lane_id = threadIdx.x % detail::warp_size;
cudf::thread_index_type const outer_num_rows = left_table.num_rows();
cudf::thread_index_type const inner_num_rows = right_table.num_rows();
auto const stride = cudf::detail::grid_1d::grid_stride();
auto const start_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();

if (0 == lane_id) { current_idx_shared[warp_id] = 0; }

Expand Down
18 changes: 10 additions & 8 deletions cpp/src/strings/convert/convert_urls.cu
Original file line number Diff line number Diff line change
Expand Up @@ -202,10 +202,11 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
__shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size];
__shared__ typename cub::WarpReduce<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];

auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const global_thread_id =
cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
char* in_chars_shared = temporary_buffer[local_warp_id];

Expand Down Expand Up @@ -287,10 +288,11 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
__shared__ typename cub::WarpScan<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
__shared__ size_type out_idx[num_warps_per_threadblock];

auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const global_thread_id =
cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
char* in_chars_shared = temporary_buffer[local_warp_id];

Expand Down

0 comments on commit a958274

Please sign in to comment.