Mark all CUDA kernel launches as hidden
Effect on binary size of libraft.a:

  23.12: 133,361,630 bytes
  pr:    129,748,904 bytes (-3,612,726 bytes, roughly 2.7% smaller)

Effect on binary size of libraft.so:

  23.12: 83,603,224 bytes
  pr:    83,873,088 bytes (+269,864 bytes, roughly 0.3% larger)
robertmaynard committed Oct 12, 2023
1 parent 1a660d2 commit e630d0f
Showing 146 changed files with 303 additions and 309 deletions.
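Every hunk below applies the same mechanical substitution: a raw __global__ void (or static __global__ void) kernel signature becomes RAFT_KERNEL. For context, here is a minimal sketch of such a macro, assuming it expands to static __global__ void; the exact definition lives in RAFT's macro headers and may differ:

// Sketch of a RAFT_KERNEL-style macro. This is an assumption consistent with
// most substitutions in this diff, not necessarily RAFT's exact definition.
// Making kernels static gives them internal linkage, so these header-only
// templates no longer export a kernel symbol from every translation unit that
// instantiates them; the linker can then drop duplicate definitions, which is
// where the libraft.a size reduction above comes from.
#if defined(__CUDACC__)
#define RAFT_KERNEL static __global__ void
#else
#define RAFT_KERNEL static void  // keeps host-only compilation of shared headers working
#endif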
2 changes: 1 addition & 1 deletion cpp/bench/prims/distance/masked_nn.cu
@@ -46,7 +46,7 @@ struct Params {
   AdjacencyPattern pattern;
 };  // struct Params
 
-__global__ void init_adj(AdjacencyPattern pattern,
+RAFT_KERNEL init_adj(AdjacencyPattern pattern,
                          int n,
                          raft::device_matrix_view<bool, int, raft::layout_c_contiguous> adj,
                          raft::device_vector_view<int, int, raft::layout_c_contiguous> group_idxs)
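Call sites are unaffected by this rename, since only the declaration changes. A sketch of a launch of the kernel above; the grid, block, and stream values here are hypothetical rather than taken from the benchmark:

// Hypothetical launch site: the triple-chevron syntax is identical whether the
// kernel was declared with __global__ void or with RAFT_KERNEL.
dim3 block(32, 32);
dim3 grid((n + block.x - 1) / block.x, (n + block.y - 1) / block.y);
init_adj<<<grid, block, 0, stream>>>(pattern, n, adj, group_idxs);
RAFT_CUDA_TRY(cudaPeekAtLastError());  // surface launch-configuration errors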
2 changes: 1 addition & 1 deletion cpp/bench/prims/sparse/convert_csr.cu
@@ -30,7 +30,7 @@ struct bench_param {
 };
 
 template <typename index_t>
-__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
+RAFT_KERNEL init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
 {
   index_t r = blockDim.y * blockIdx.y + threadIdx.y;
   index_t c = blockDim.x * blockIdx.x + threadIdx.x;
4 changes: 2 additions & 2 deletions cpp/include/raft/cluster/detail/agglomerative.cuh
@@ -155,7 +155,7 @@ void build_dendrogram_host(raft::resources const& handle,
 }
 
 template <typename value_idx>
-__global__ void write_levels_kernel(const value_idx* children,
+RAFT_KERNEL write_levels_kernel(const value_idx* children,
                                     value_idx* parents,
                                     value_idx n_vertices)
 {
@@ -179,7 +179,7 @@ __global__ void write_levels_kernel(const value_idx* children,
  * @param labels
  */
 template <typename value_idx>
-__global__ void inherit_labels(const value_idx* children,
+RAFT_KERNEL inherit_labels(const value_idx* children,
                                const value_idx* levels,
                                std::size_t n_leaves,
                                value_idx* labels,
2 changes: 1 addition & 1 deletion cpp/include/raft/cluster/detail/connectivities.cuh
@@ -107,7 +107,7 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx,
 };
 
 template <typename value_idx>
-__global__ void fill_indices2(value_idx* indices, size_t m, size_t nnz)
+RAFT_KERNEL fill_indices2(value_idx* indices, size_t m, size_t nnz)
 {
   value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (tid >= nnz) return;
2 changes: 1 addition & 1 deletion cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -434,7 +434,7 @@ template <uint32_t BlockDimY,
           typename LabelT,
           typename CounterT,
           typename MappingOpT>
-__global__ void __launch_bounds__((WarpSize * BlockDimY))
+__launch_bounds__((WarpSize * BlockDimY)) RAFT_KERNEL
   adjust_centers_kernel(MathT* centers,  // [n_clusters, dim]
                         IdxT n_clusters,
                         IdxT dim,
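Note the reordering in this hunk: __launch_bounds__ moves in front of the macro, while other files in this commit (for example merge_labels.cuh below) leave it after. Both placements compile under nvcc, assuming the macro expands to static __global__ void, because CUDA accepts __launch_bounds__ either before the declaration or between the return type and the kernel name. A self-contained sketch; RAFT_KERNEL_SKETCH is a hypothetical stand-in, not RAFT's macro:

// Both __launch_bounds__ placements seen in this diff, side by side.
#define RAFT_KERNEL_SKETCH static __global__ void  // assumed expansion

__launch_bounds__(256) RAFT_KERNEL_SKETCH kernel_a(int* out)  // attribute first
{
  out[threadIdx.x] = 0;
}

RAFT_KERNEL_SKETCH __launch_bounds__(256) kernel_b(int* out)  // attribute after the return type
{
  out[threadIdx.x] = 1;
}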
10 changes: 5 additions & 5 deletions cpp/include/raft/cluster/detail/kmeans_deprecated.cuh
@@ -92,7 +92,7 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
  *   initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void computeDistances(index_type_t n,
+RAFT_KERNEL computeDistances(index_type_t n,
                                         index_type_t d,
                                         index_type_t k,
                                         const value_type_t* __restrict__ obs,
@@ -173,7 +173,7 @@ static __global__ void computeDistances(index_type_t n,
  * cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void minDistances(index_type_t n,
+RAFT_KERNEL minDistances(index_type_t n,
                                     index_type_t k,
                                     value_type_t* __restrict__ dists,
                                     index_type_t* __restrict__ codes,
@@ -233,7 +233,7 @@ static __global__ void minDistances(index_type_t n,
  * @param code_new Index associated with new centroid.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void minDistances2(index_type_t n,
+RAFT_KERNEL minDistances2(index_type_t n,
                                      value_type_t* __restrict__ dists_old,
                                      const value_type_t* __restrict__ dists_new,
                                      index_type_t* __restrict__ codes_old,
@@ -275,7 +275,7 @@ static __global__ void minDistances2(index_type_t n,
  * cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t>
-static __global__ void computeClusterSizes(index_type_t n,
+RAFT_KERNEL computeClusterSizes(index_type_t n,
                                            const index_type_t* __restrict__ codes,
                                            index_type_t* __restrict__ clusterSizes)
 {
@@ -308,7 +308,7 @@ static __global__ void computeClusterSizes(index_type_t n,
  * column is the mean position of a cluster).
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void divideCentroids(index_type_t d,
+RAFT_KERNEL divideCentroids(index_type_t d,
                                        index_type_t k,
                                        const index_type_t* __restrict__ clusterSizes,
                                        value_type_t* __restrict__ centroids)
2 changes: 1 addition & 1 deletion cpp/include/raft/common/detail/scatter.cuh
@@ -22,7 +22,7 @@
 
 namespace raft::detail {
 
 template <typename DataT, int VecLen, typename Lambda, typename IdxT>
-__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op)
+RAFT_KERNEL scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op)
 {
   typedef TxN_t<DataT, VecLen> DataVec;
   typedef TxN_t<IdxT, VecLen> IdxVec;
2 changes: 1 addition & 1 deletion cpp/include/raft/core/detail/copy.hpp
@@ -329,7 +329,7 @@ __device__ auto increment_indices(IdxType* indices,
  * parameters.
  */
 template <typename DstType, typename SrcType>
-__global__ mdspan_copyable_with_kernel_t<DstType, SrcType> mdspan_copy_kernel(DstType dst,
+RAFT_KERNEL mdspan_copyable_with_kernel_t<DstType, SrcType> mdspan_copy_kernel(DstType dst,
                                                                                SrcType src)
 {
   using config = mdspan_copyable<true, DstType, SrcType>;
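This hunk is the odd one out: the kernel's return type is the SFINAE alias mdspan_copyable_with_kernel_t rather than a plain void, so here the macro can only contribute the qualifiers for the resulting line to parse. A sketch of the same constrained-kernel pattern; the macro and alias below are hypothetical illustrations, not RAFT's definitions:

// A SFINAE-constrained kernel: the alias resolves to void only when Src is
// assignable to Dst, removing the kernel from overload resolution otherwise.
#include <type_traits>

template <typename Dst, typename Src>
using copyable_t = std::enable_if_t<std::is_assignable_v<Dst&, Src>, void>;

#define KERNEL_SKETCH static __global__  // note: no trailing void here

template <typename Dst, typename Src>
KERNEL_SKETCH copyable_t<Dst, Src> copy_one(Dst* dst, const Src* src)
{
  *dst = *src;  // placeholder body; a real kernel would index by thread
}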
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/compress_to_bits.cuh
@@ -35,7 +35,7 @@ namespace raft::distance::detail {
  * Note: the division (`/`) is a ceilDiv.
  */
 template <typename T = uint64_t, typename = std::enable_if_t<std::is_integral<T>::value>>
-__global__ void compress_to_bits_kernel(
+RAFT_KERNEL compress_to_bits_kernel(
   raft::device_matrix_view<const bool, int, raft::layout_c_contiguous> in,
   raft::device_matrix_view<T, int, raft::layout_c_contiguous> out)
 {
4 changes: 2 additions & 2 deletions cpp/include/raft/distance/detail/fused_l2_nn.cuh
@@ -87,7 +87,7 @@ struct MinReduceOpImpl {
 };
 
 template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
+RAFT_KERNEL initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
 {
   auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x;
   if (tid < m) { redOp.init(min + tid, maxVal); }
@@ -139,7 +139,7 @@ template <typename DataT,
           typename KVPReduceOpT,
           typename OpT,
           typename FinalLambda>
-__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
+__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL fusedL2NNkernel(OutT* min,
                                                                   const DataT* x,
                                                                   const DataT* y,
                                                                   const DataT* xn,
10 changes: 5 additions & 5 deletions cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh
@@ -36,7 +36,7 @@ namespace raft::distance::kernels::detail {
  * @param offset
  */
 template <typename math_t, typename exp_t>
-__global__ void polynomial_kernel_nopad(
+RAFT_KERNEL polynomial_kernel_nopad(
   math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
 {
   for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
@@ -56,7 +56,7 @@ __global__ void polynomial_kernel_nopad(
  * @param offset
  */
 template <typename math_t, typename exp_t>
-__global__ void polynomial_kernel(
+RAFT_KERNEL polynomial_kernel(
   math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
 {
   for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
@@ -75,7 +75,7 @@ __global__ void polynomial_kernel(
  * @param offset
  */
 template <typename math_t>
-__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
+RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
 {
   for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
        tid += blockDim.x * gridDim.x) {
@@ -93,7 +93,7 @@ __global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t
  * @param offset
  */
 template <typename math_t>
-__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
+RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
 {
   for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
        tidy += blockDim.y * gridDim.y)
@@ -121,7 +121,7 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga
  * @param gain
  */
 template <typename math_t>
-__global__ void rbf_kernel_expanded(
+RAFT_KERNEL rbf_kernel_expanded(
   math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain)
 {
   for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/masked_nn.cuh
@@ -40,7 +40,7 @@ template <typename DataT,
           typename KVPReduceOpT,
           typename CoreLambda,
           typename FinalLambda>
-__global__ __launch_bounds__(P::Nthreads, 2) void masked_l2_nn_kernel(OutT* min,
+__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL masked_l2_nn_kernel(OutT* min,
                                                                       const DataT* x,
                                                                       const DataT* y,
                                                                       const DataT* xn,
@@ -31,7 +31,7 @@ template <typename Policy,
           typename DataT,
           typename OutT,
           typename FinOpT>
-__global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel(
+__launch_bounds__(Policy::Nthreads, 2) RAFT_KERNEL pairwise_matrix_kernel(
   OpT distance_op, pairwise_matrix_params<IdxT, DataT, OutT, FinOpT> params)
 {
   // Early exit to minimize the size of the kernel when it is not supposed to be compiled.
2 changes: 1 addition & 1 deletion cpp/include/raft/label/detail/classlabels.cuh
@@ -119,7 +119,7 @@ void getOvrlabels(
 // +/-1, return array with the new class labels and corresponding indices.
 
 template <typename Type, int TPB_X, typename Lambda>
-__global__ void map_label_kernel(Type* map_ids,
+RAFT_KERNEL map_label_kernel(Type* map_ids,
                                  size_t N_labels,
                                  Type* in,
                                  Type* out,
4 changes: 2 additions & 2 deletions cpp/include/raft/label/detail/merge_labels.cuh
@@ -32,7 +32,7 @@ namespace detail {
  * For an additional cost we can build the graph with edges
  * E={(A[i], B[i]) | M[i]=1} and make this step faster */
 template <typename value_idx, int TPB_X = 256>
-__global__ void __launch_bounds__(TPB_X)
+RAFT_KERNEL __launch_bounds__(TPB_X)
   propagate_label_kernel(const value_idx* __restrict__ labels_a,
                          const value_idx* __restrict__ labels_b,
                          value_idx* __restrict__ R,
@@ -65,7 +65,7 @@ __global__ void __launch_bounds__(TPB_X)
 }
 
 template <typename value_idx, int TPB_X = 256>
-__global__ void __launch_bounds__(TPB_X)
+RAFT_KERNEL __launch_bounds__(TPB_X)
   reassign_label_kernel(value_idx* __restrict__ labels_a,
                         const value_idx* __restrict__ labels_b,
                         const value_idx* __restrict__ R,
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/add.cuh
@@ -38,7 +38,7 @@ void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t st
 }
 
 template <class InT, typename IdxType, typename OutT = InT>
-__global__ void add_dev_scalar_kernel(OutT* outDev,
+RAFT_KERNEL add_dev_scalar_kernel(OutT* outDev,
                                       const InT* inDev,
                                       const InT* singleScalarDev,
                                       IdxType len)
6 changes: 3 additions & 3 deletions cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
@@ -40,7 +40,7 @@ template <typename Policy,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-__global__ void __launch_bounds__(Policy::ThreadsPerBlock)
+RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock)
   coalescedReductionThinKernel(OutType* dots,
                                const InType* data,
                                IdxType D,
@@ -137,7 +137,7 @@ template <int TPB,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-__global__ void __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots,
+RAFT_KERNEL __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots,
                                                                       const InType* data,
                                                                       IdxType D,
                                                                       IdxType N,
@@ -225,7 +225,7 @@ template <typename Policy,
           typename IdxType,
           typename MainLambda,
           typename ReduceLambda>
-__global__ void __launch_bounds__(Policy::ThreadsPerBlock)
+RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock)
   coalescedReductionThickKernel(OutType* buffer,
                                 const InType* data,
                                 IdxType D,
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/map.cuh
@@ -65,7 +65,7 @@ __device__ __forceinline__ void map_kernel_mainloop(
 }
 
 template <int R, bool PassOffset, typename OutT, typename IdxT, typename Func, typename... InTs>
-__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs)
+RAFT_KERNEL map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs)
 {
   const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x;
   if constexpr (R <= 1) {
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/map_then_reduce.cuh
@@ -52,7 +52,7 @@ template <typename InType,
           typename ReduceLambda,
           int TPB,
           typename... Args>
-__global__ void mapThenReduceKernel(OutType* out,
+RAFT_KERNEL mapThenReduceKernel(OutType* out,
                                     IdxType len,
                                     OutType neutral,
                                     MapOp map,
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/detail/normalize.cuh
@@ -35,7 +35,7 @@ template <typename Policy,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-__global__ void __launch_bounds__(Policy::ThreadsPerBlock)
+RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock)
   coalesced_normalize_thin_kernel(Type* out,
                                   const Type* in,
                                   IdxType D,
@@ -92,7 +92,7 @@ template <int TPB,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-__global__ void __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out,
+RAFT_KERNEL __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out,
                                                                          const Type* in,
                                                                          IdxType D,
                                                                          IdxType N,
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
@@ -29,7 +29,7 @@ namespace detail {
 ///@todo: specialize this to support shared-mem based atomics
 
 template <typename T, typename KeyIteratorT, typename IdxType>
-__global__ void reduce_cols_by_key_direct_kernel(
+RAFT_KERNEL reduce_cols_by_key_direct_kernel(
   const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys)
 {
   typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
@@ -44,7 +44,7 @@ __global__ void reduce_cols_by_key_direct_kernel(
 }
 
 template <typename T, typename KeyIteratorT, typename IdxType>
-__global__ void reduce_cols_by_key_cached_kernel(
+RAFT_KERNEL reduce_cols_by_key_cached_kernel(
   const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys)
 {
   typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
8 changes: 4 additions & 4 deletions cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -33,7 +33,7 @@ namespace detail {
 //
 
 template <typename IteratorT1, typename IteratorT2>
-void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n)
+RAFT_KERNEL convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n)
 {
   for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) {
     dst[idx] = src[idx];
@@ -95,7 +95,7 @@ struct quadSum {
 template <typename DataIteratorT, typename WeightT, typename SumsT, typename IdxT>
 __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
 
-__global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
+RAFT_KERNEL sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
                                                    IdxT lda,
                                                    const char* d_keys,
                                                    const WeightT* d_weights,
@@ -193,7 +193,7 @@ template <typename DataIteratorT,
           typename WeightT,
           typename SumsT,
           typename IdxT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
+RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
                                                             IdxT lda,
                                                             KeysIteratorT d_keys,
                                                             const WeightT* d_weights,
@@ -269,7 +269,7 @@ template <typename DataIteratorT,
           typename WeightT,
           typename SumsT,
           typename IdxT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
+RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
                                                             IdxT lda,
                                                             const WeightT* d_weights,
                                                             KeysIteratorT d_keys,
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -30,7 +30,7 @@ namespace detail {
 // of the matrix, i.e. reduce along columns for row major or reduce along rows
 // for column major layout
 template <typename Type, typename MainLambda>
-__global__ void stridedSummationKernel(
+RAFT_KERNEL stridedSummationKernel(
   Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op)
 {
   // Thread reduction
@@ -68,7 +68,7 @@ template <typename InType,
           typename IdxType,
           typename MainLambda,
           typename ReduceLambda>
-__global__ void stridedReductionKernel(OutType* dots,
+RAFT_KERNEL stridedReductionKernel(OutType* dots,
                                        const InType* data,
                                        int D,
                                        int N,
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/subtract.cuh
@@ -38,7 +38,7 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream
 }
 
 template <class math_t, typename IdxType>
-__global__ void subtract_dev_scalar_kernel(math_t* outDev,
+RAFT_KERNEL subtract_dev_scalar_kernel(math_t* outDev,
                                            const math_t* inDev,
                                            const math_t* singleScalarDev,
                                            IdxType len)