Merge branch 'branch-23.10' into column-size-bytes
davidwendt committed Sep 1, 2023
2 parents c217d28 + d1fb671 commit 2a7ebd1
Showing 14 changed files with 233 additions and 54 deletions.
2 changes: 1 addition & 1 deletion cpp/include/cudf/contiguous_split.hpp
@@ -28,7 +28,7 @@ namespace cudf {
* @addtogroup column_copy
* @{
* @file
- * @brief Table APIs for contiguous_split, pack, unpack, and metadadata
+ * @brief Table APIs for contiguous_split, pack, unpack, and metadata
*/

/**
23 changes: 23 additions & 0 deletions cpp/include/cudf/detail/utilities/cuda.cuh
@@ -92,6 +92,29 @@ class grid_1d {
{
return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
}

/**
 * @brief Returns the stride of a 1D grid.
 *
 * The returned stride is the total number of threads in the grid.
 *
 * @param num_threads_per_block The number of threads per block
 * @param num_blocks_per_grid The number of blocks in the grid
 * @return thread_index_type The total number of threads in the grid
 */
static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
thread_index_type num_blocks_per_grid)
{
return num_threads_per_block * num_blocks_per_grid;
}

/**
* @brief Returns the stride of the current 1D grid.
*
* @return thread_index_type The number of threads in the grid.
*/
static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }
};

/**
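The new grid_stride helpers pair with global_thread_id to replace the hand-written threadIdx/blockDim arithmetic in the kernels changed below. A minimal sketch of the resulting grid-stride loop, using a hypothetical scale_kernel for illustration (not part of this commit):

```cuda
#include <cudf/detail/utilities/cuda.cuh>

// Each thread starts at its global index and advances by the total thread
// count, so any grid size covers any input size.
__global__ void scale_kernel(float* data, cudf::thread_index_type size)
{
  auto const start  = cudf::detail::grid_1d::global_thread_id();
  auto const stride = cudf::detail::grid_1d::grid_stride();
  for (auto i = start; i < size; i += stride) {
    data[i] *= 2.0f;
  }
}
```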
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -49,8 +49,8 @@ __global__ void valid_if_kernel(
{
constexpr size_type leader_lane{0};
auto const lane_id{threadIdx.x % warp_size};
- thread_index_type i = threadIdx.x + blockIdx.x * blockDim.x;
- thread_index_type const stride = blockDim.x * gridDim.x;
+ auto i = cudf::detail::grid_1d::global_thread_id();
+ auto const stride = cudf::detail::grid_1d::grid_stride();
size_type warp_valid_count{0};

auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size);
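For context on the surrounding kernel: valid_if_kernel lets each lane evaluate its predicate, packs the 32 per-lane results into one bitmask word with __ballot_sync, and has the leader lane write the word. A simplified sketch of that warp pattern (hypothetical helper; it assumes a full warp is in bounds, whereas the real kernel narrows active_mask for the tail):

```cuda
__device__ void write_validity_word(cudf::bitmask_type* mask,
                                    cudf::thread_index_type row,
                                    bool is_valid,
                                    cudf::size_type lane_id)
{
  // Each of the 32 lanes contributes one validity bit to the word.
  cudf::bitmask_type const word = __ballot_sync(0xFFFF'FFFFu, is_valid);
  // One write per warp, performed by the leader lane (lane 0).
  if (lane_id == 0) { mask[row / 32] = word; }
}
```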
6 changes: 3 additions & 3 deletions cpp/src/bitmask/null_mask.cu
@@ -108,7 +108,7 @@ __global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination,
thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit);
bitmask_type fill_value = valid ? 0xffff'ffff : 0;

- thread_index_type const stride = blockDim.x * gridDim.x;
+ auto const stride = cudf::detail::grid_1d::grid_stride();

for (thread_index_type destination_word_index = grid_1d::global_thread_id();
destination_word_index < number_of_mask_words;
@@ -191,7 +191,7 @@ __global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination,
size_type source_end_bit,
size_type number_of_mask_words)
{
- thread_index_type const stride = blockDim.x * gridDim.x;
+ auto const stride = cudf::detail::grid_1d::grid_stride();
for (thread_index_type destination_word_index = grid_1d::global_thread_id();
destination_word_index < number_of_mask_words;
destination_word_index += stride) {
@@ -265,7 +265,7 @@ __global__ void count_set_bits_kernel(bitmask_type const* bitmask,
auto const first_word_index{word_index(first_bit_index)};
auto const last_word_index{word_index(last_bit_index)};
thread_index_type const tid = grid_1d::global_thread_id();
- thread_index_type const stride = blockDim.x * gridDim.x;
+ thread_index_type const stride = grid_1d::grid_stride();
thread_index_type thread_word_index = tid + first_word_index;
size_type thread_count{0};

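Host code sizes the launches for these bitmask kernels with cudf::detail::grid_1d, and the new grid_stride() mirrors that geometry on the device side. A hedged host-side sketch (the 256-thread block size and the exact argument list are illustrative, not lifted from this file):

```cuda
cudf::detail::grid_1d config(number_of_mask_words, 256);
set_null_mask_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
  destination, begin_bit, end_bit, valid, number_of_mask_words);
```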
4 changes: 2 additions & 2 deletions cpp/src/copying/scatter.cu
@@ -52,8 +52,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
MapIterator scatter_map,
size_type num_scatter_rows)
{
- thread_index_type row = threadIdx.x + blockIdx.x * blockDim.x;
- thread_index_type const stride = blockDim.x * gridDim.x;
+ auto row = cudf::detail::grid_1d::global_thread_id();
+ auto const stride = cudf::detail::grid_1d::grid_stride();

while (row < num_scatter_rows) {
size_type const output_row = scatter_map[row];
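Each scattered row flips one bit in the destination null mask, and rows handled by different warps can share a 32-bit word, which is why the update must be atomic. An illustrative helper (not the kernel's literal code) for that single-bit update:

```cuda
__device__ void set_bit_atomic(cudf::bitmask_type* mask, cudf::size_type row)
{
  // OR in this row's bit; atomic because 32 neighboring rows share one word.
  atomicOr(&mask[row / 32], cudf::bitmask_type{1} << (row % 32));
}
```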
18 changes: 8 additions & 10 deletions cpp/src/partitioning/partitioning.cu
@@ -134,8 +134,8 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
// Accumulate histogram of the size of each partition in shared memory
extern __shared__ size_type shared_partition_sizes[];

- auto tid = cudf::thread_index_type{threadIdx.x} +
-            cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
+ auto tid = cudf::detail::grid_1d::global_thread_id();
+ auto const stride = cudf::detail::grid_1d::grid_stride();

// Initialize local histogram
size_type partition_number = threadIdx.x;
@@ -160,7 +160,7 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher,
row_partition_offset[row_number] =
atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1));

- tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x};
+ tid += stride;
}

__syncthreads();
@@ -215,8 +215,8 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti
}
__syncthreads();

- auto tid = cudf::thread_index_type{threadIdx.x} +
-            cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
+ auto tid = cudf::detail::grid_1d::global_thread_id();
+ auto const stride = cudf::detail::grid_1d::grid_stride();

// Get each row's partition number, and get it's output location by
// incrementing block's offset counter for that partition number
@@ -234,7 +234,7 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti
// Store the row's output location in-place
row_partition_numbers[row_number] = row_output_location;

- tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x};
+ tid += stride;
}
}

@@ -311,10 +311,8 @@ __global__ void copy_block_partitions(InputIter input_iter,
__syncthreads();

// Fetch the input data to shared memory
- for (auto tid = cudf::thread_index_type{threadIdx.x} +
-                 cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x};
-      tid < num_rows;
-      tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}) {
+ for (auto tid = cudf::detail::grid_1d::global_thread_id(); tid < num_rows;
+      tid += cudf::detail::grid_1d::grid_stride()) {
auto const row_number = static_cast<size_type>(tid);
size_type const ipartition = row_partition_numbers[row_number];

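The partitioning kernels above combine the grid-stride loop with a two-level histogram: each block counts rows per partition in shared memory, then folds its local counts into global totals. A condensed sketch of that scheme (hypothetical histogram_kernel; assumes num_bins words of dynamic shared memory are supplied at launch):

```cuda
__global__ void histogram_kernel(cudf::size_type const* bins,
                                 cudf::thread_index_type num_rows,
                                 cudf::size_type* global_counts,
                                 cudf::size_type num_bins)
{
  extern __shared__ cudf::size_type block_counts[];

  // Zero the block-local histogram cooperatively.
  for (cudf::size_type b = threadIdx.x; b < num_bins; b += blockDim.x) {
    block_counts[b] = 0;
  }
  __syncthreads();

  // Accumulate locally: shared-memory atomics are far cheaper than global.
  for (auto i = cudf::detail::grid_1d::global_thread_id(); i < num_rows;
       i += cudf::detail::grid_1d::grid_stride()) {
    atomicAdd(&block_counts[bins[i]], cudf::size_type{1});
  }
  __syncthreads();

  // Fold the block's counts into the global histogram.
  for (cudf::size_type b = threadIdx.x; b < num_bins; b += blockDim.x) {
    atomicAdd(&global_counts[b], block_counts[b]);
  }
}
```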
12 changes: 6 additions & 6 deletions cpp/src/replace/nulls.cu
@@ -64,9 +64,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
char* chars,
cudf::size_type* valid_counter)
{
- cudf::size_type nrows = input.size();
- cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
- cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+ cudf::size_type nrows = input.size();
+ auto i = cudf::detail::grid_1d::global_thread_id();
+ auto const stride = cudf::detail::grid_1d::grid_stride();

uint32_t active_mask = 0xffff'ffff;
active_mask = __ballot_sync(active_mask, i < nrows);
@@ -117,9 +117,9 @@ __global__ void replace_nulls(cudf::column_device_view input,
cudf::mutable_column_device_view output,
cudf::size_type* output_valid_count)
{
- cudf::size_type nrows = input.size();
- cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
- cudf::thread_index_type const stride = blockDim.x * gridDim.x;
+ cudf::size_type nrows = input.size();
+ auto i = cudf::detail::grid_1d::global_thread_id();
+ auto const stride = cudf::detail::grid_1d::grid_stride();

uint32_t active_mask = 0xffff'ffff;
active_mask = __ballot_sync(active_mask, i < nrows);
5 changes: 2 additions & 3 deletions cpp/src/transform/compute_column.cu
@@ -69,9 +69,8 @@ __launch_bounds__(max_block_size) __global__

auto thread_intermediate_storage =
&intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
- auto const start_idx =
-   static_cast<cudf::thread_index_type>(threadIdx.x + blockIdx.x * blockDim.x);
- auto const stride = static_cast<cudf::thread_index_type>(blockDim.x * gridDim.x);
+ auto start_idx = cudf::detail::grid_1d::global_thread_id();
+ auto const stride = cudf::detail::grid_1d::grid_stride();
auto evaluator =
cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

32 changes: 32 additions & 0 deletions java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java
@@ -0,0 +1,32 @@
/*
*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package ai.rapids.cudf;

/**
 * Represents some amount of host memory that has been reserved. A reservation guarantees that one
 * or more allocations, up to the reserved amount minus padding for alignment, will succeed. A
 * reservation typically guarantees the amount can be allocated only once, meaning that when a
 * buffer allocated from a reservation is freed it is not returned to the reservation, but to the
 * pool of memory the reservation originally came from. If more memory than was reserved is
 * allocated from the reservation, an OutOfMemoryError may be thrown, but this is not guaranteed
 * to happen.
 *
 * When the reservation is closed, any unused portion of the reservation is returned to the pool
 * of memory the reservation came from.
 */
public interface HostMemoryReservation extends HostMemoryAllocator, AutoCloseable {}
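A hedged sketch of how the contract above plays out in code; the pool and its reserve method are hypothetical stand-ins, while allocate is inherited from HostMemoryAllocator:

```java
try (HostMemoryReservation reservation = pool.reserve(64L * 1024 * 1024)) {
  // Succeeds while cumulative allocations stay within the reserved amount,
  // minus any padding needed for alignment.
  HostMemoryBuffer buffer = reservation.allocate(1024);
  // Closing the buffer returns its memory to the originating pool, not to
  // the reservation, so the reserved amount is typically usable only once.
  buffer.close();
} // Any unused portion of the reservation is returned to the pool here.
```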