Skip to content

Commit

Permalink
Add num_multiprocessors utility (rapidsai#16628)
Browse files Browse the repository at this point in the history
This PR introduces a new `num_multiprocessors` utility and moves the existing `elements_per_thread` host utility to the new `cuda.hpp` header. 

Needed by rapidsai#16619.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Mark Harris (https://github.com/harrism)

URL: rapidsai#16628
  • Loading branch information
PointKernel authored Aug 27, 2024
1 parent f1cc962 commit 2d494ed
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 47 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,7 @@ add_library(
src/unary/math_ops.cu
src/unary/nan_ops.cu
src/unary/null_ops.cu
src/utilities/cuda.cpp
src/utilities/cuda_memcpy.cu
src/utilities/default_stream.cpp
src/utilities/host_memory.cpp
Expand Down
10 changes: 3 additions & 7 deletions cpp/benchmarks/join/generate_input_tables.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
Expand Down Expand Up @@ -150,13 +151,8 @@ void generate_input_tables(key_type* const build_tbl,
CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&num_blocks_init_probe_tbl, init_probe_tbl<key_type, size_type>, block_size, 0));

int dev_id{-1};
CUDF_CUDA_TRY(cudaGetDevice(&dev_id));

int num_sms{-1};
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));

int const num_states =
auto const num_sms = cudf::detail::num_multiprocessors();
auto const num_states =
num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
rmm::device_uvector<curandState> devStates(num_states, cudf::get_default_stream());

Expand Down
1 change: 1 addition & 0 deletions cpp/include/cudf/detail/copy_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/detail/gather.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/table/table.hpp>
Expand Down
29 changes: 0 additions & 29 deletions cpp/include/cudf/detail/utilities/cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -189,35 +189,6 @@ __device__ T single_lane_block_sum_reduce(T lane_value)
return result;
}

/**
* @brief Get the number of elements that can be processed per thread.
*
* @param[in] kernel The kernel for which the elements per thread needs to be assessed
* @param[in] total_size Number of elements
* @param[in] block_size Expected block size
*
* @return cudf::size_type Elements per thread that can be processed for given specification.
*/
template <typename Kernel>
cudf::size_type elements_per_thread(Kernel kernel,
cudf::size_type total_size,
cudf::size_type block_size,
cudf::size_type max_per_thread = 32)
{
CUDF_FUNC_RANGE();

// calculate theoretical occupancy
int max_blocks = 0;
CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));

int device = 0;
CUDF_CUDA_TRY(cudaGetDevice(&device));
int num_sms = 0;
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
int per_thread = total_size / (max_blocks * num_sms * block_size);
return std::clamp(per_thread, 1, max_per_thread);
}

/**
* @brief Finds the smallest value not less than `number_to_round` and modulo `modulus` is
* zero. Expects modulus to be a power of 2.
Expand Down
59 changes: 59 additions & 0 deletions cpp/include/cudf/detail/utilities/cuda.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>

#include <algorithm>

namespace CUDF_EXPORT cudf {
namespace detail {

/**
* @brief Get the number of multiprocessors on the device
*/
cudf::size_type num_multiprocessors();

/**
* @brief Get the number of elements that can be processed per thread.
*
* @param[in] kernel The kernel for which the elements per thread needs to be assessed
* @param[in] total_size Number of elements
* @param[in] block_size Expected block size
*
* @return cudf::size_type Elements per thread that can be processed for given specification.
*/
template <typename Kernel>
cudf::size_type elements_per_thread(Kernel kernel,
cudf::size_type total_size,
cudf::size_type block_size,
cudf::size_type max_per_thread = 32)
{
CUDF_FUNC_RANGE();

// calculate theoretical occupancy
int max_blocks = 0;
CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));

int per_thread = total_size / (max_blocks * num_multiprocessors() * block_size);
return std::clamp(per_thread, 1, max_per_thread);
}

} // namespace detail
} // namespace CUDF_EXPORT cudf
18 changes: 7 additions & 11 deletions cpp/src/io/comp/debrotli.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ THE SOFTWARE.
#include "gpuinflate.hpp"
#include "io/utilities/block_utils.cuh"

#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -2047,19 +2048,14 @@ CUDF_KERNEL void __launch_bounds__(block_size, 2)
*/
size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs)
{
int sm_count = 0;
int dev = 0;
uint32_t max_fb_size, min_fb_size, fb_size;
CUDF_CUDA_TRY(cudaGetDevice(&dev));
if (cudaSuccess == cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev)) {
// printf("%d SMs on device %d\n", sm_count, dev);
max_num_inputs =
min(max_num_inputs, sm_count * 3); // no more than 3 blocks/sm at most due to 32KB smem use
if (max_num_inputs <= 0) {
max_num_inputs = sm_count * 2; // Target 2 blocks/SM by default for scratch mem computation
}
auto const sm_count = cudf::detail::num_multiprocessors();
// no more than 3 blocks/sm at most due to 32KB smem use
max_num_inputs = std::min(max_num_inputs, sm_count * 3);
if (max_num_inputs <= 0) {
max_num_inputs = sm_count * 2; // Target 2 blocks/SM by default for scratch mem computation
}
max_num_inputs = min(max(max_num_inputs, 1), 512);
max_num_inputs = std::min(std::max(max_num_inputs, 1), 512);
// Max fb size per block occurs if all huffman tables for all 3 group types fail local_alloc()
// with num_htrees=256 (See HuffmanTreeGroupAlloc)
max_fb_size = 256 * (630 + 1080 + 920) * 2; // 1.3MB
Expand Down
34 changes: 34 additions & 0 deletions cpp/src/utilities/cuda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>

#include <cuda_runtime.h>

namespace cudf::detail {

cudf::size_type num_multiprocessors()
{
int device = 0;
CUDF_CUDA_TRY(cudaGetDevice(&device));
int num_sms = 0;
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
return num_sms;
}

} // namespace cudf::detail

0 comments on commit 2d494ed

Please sign in to comment.