Skip to content

Commit

Permalink
Add explicit instantiations for IVF-PQ search kernels used in tests (#…
Browse files Browse the repository at this point in the history
…2212)

Compilation of IVF-PQ search kernels can be time consuming. In `libraft.so` the compilation is done in parallel for kernels without filtering and with `int64_t` index type.

We have test with `uint32_t` index type as well as tests for `bitset_filter` with both 32 and 64 bit index types. This PR adds explicit template instantiations for the test. This way we avoid repeated compilation of the kernels with filter and this also enables parallel compilation of the `compute_similarity` kernel for different template types. The kernels with these additional type parameters are not added to `libraft.so`, only linked together with the test executable. 

Note that this PR does not increase the number of compiled kernels, but it enables to compile them in parallel.

Authors:
  - Tamas Bela Feher (https://github.com/tfeher)

Approvers:
  - Artem M. Chirkin (https://github.com/achirkin)
  - Ben Frederickson (https://github.com/benfred)

URL: #2212
  • Loading branch information
tfeher authored Mar 18, 2024
1 parent 32f6f40 commit d14cac2
Show file tree
Hide file tree
Showing 45 changed files with 1,245 additions and 486 deletions.
8 changes: 8 additions & 0 deletions cpp/bench/prims/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ if(BUILD_PRIMS_BENCH)
bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu
bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu
src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
bench/prims/neighbors/refine_float_int64_t.cu
bench/prims/neighbors/refine_uint8_t_int64_t.cu
bench/prims/main.cpp
Expand Down
5 changes: 3 additions & 2 deletions cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,9 +14,10 @@
* limitations under the License.
*/

#undef RAFT_EXPLICIT_INSTANTIATE_ONLY // Enable instantiation of search with filter
#include "../knn.cuh"

#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
#include <raft_internal/neighbors/ivf_pq_search_test-ext.cuh>
namespace raft::bench::spatial {

KNN_REGISTER(float, int64_t, ivf_pq_filter_knn, kInputsFilter, kNoCopyOnly, kScopeFull);
Expand Down
1 change: 1 addition & 0 deletions cpp/include/raft/core/detail/nvtx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>

namespace raft::common::nvtx::detail {

Expand Down
46 changes: 25 additions & 21 deletions cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ namespace raft::neighbors::ivf_pq::detail {

using namespace raft::spatial::knn::detail; // NOLINT

using internal_extents_t = int64_t; // The default mdspan extent type used internally.

template <uint32_t BlockDim, typename T, typename S>
__launch_bounds__(BlockDim) RAFT_KERNEL copy_warped_kernel(
T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows)
Expand Down Expand Up @@ -442,15 +444,16 @@ void train_per_subset(raft::resources const& handle,
stream);

// train PQ codebook for this subspace
auto sub_trainset_view =
raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), n_rows, index.pq_len());
auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
auto sub_trainset_view = raft::make_device_matrix_view<const float, internal_extents_t>(
sub_trainset.data(), n_rows, index.pq_len());
auto centers_tmp_view = raft::make_device_matrix_view<float, internal_extents_t>(
pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j,
index.pq_book_size(),
index.pq_len());
auto sub_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(sub_labels.data(), n_rows);
auto cluster_sizes_view =
raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
auto sub_labels_view =
raft::make_device_vector_view<uint32_t, internal_extents_t>(sub_labels.data(), n_rows);
auto cluster_sizes_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
pq_cluster_sizes.data(), index.pq_book_size());
raft::cluster::kmeans_balanced_params kmeans_params;
kmeans_params.n_iters = kmeans_n_iters;
kmeans_params.metric = raft::distance::DistanceType::L2Expanded;
Expand Down Expand Up @@ -525,17 +528,17 @@ void train_per_cluster(raft::resources const& handle,
size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim());
auto pq_n_rows = uint32_t(std::min(big_enough, available_rows));
// train PQ codebook for this cluster
auto rot_vectors_view = raft::make_device_matrix_view<const float, IdxT>(
auto rot_vectors_view = raft::make_device_matrix_view<const float, internal_extents_t>(
rot_vectors.data(), pq_n_rows, index.pq_len());
auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
auto centers_tmp_view = raft::make_device_matrix_view<float, internal_extents_t>(
pq_centers_tmp.data() + static_cast<size_t>(index.pq_book_size()) *
static_cast<size_t>(index.pq_len()) * static_cast<size_t>(l),
index.pq_book_size(),
index.pq_len());
auto pq_labels_view =
raft::make_device_vector_view<uint32_t, IdxT>(pq_labels.data(), pq_n_rows);
auto pq_cluster_sizes_view =
raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
raft::make_device_vector_view<uint32_t, internal_extents_t>(pq_labels.data(), pq_n_rows);
auto pq_cluster_sizes_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
pq_cluster_sizes.data(), index.pq_book_size());
raft::cluster::kmeans_balanced_params kmeans_params;
kmeans_params.n_iters = kmeans_n_iters;
kmeans_params.metric = raft::distance::DistanceType::L2Expanded;
Expand Down Expand Up @@ -1587,11 +1590,11 @@ void extend(raft::resources const& handle,
cudaMemcpyDefault,
stream));
for (const auto& batch : vec_batches) {
auto batch_data_view =
raft::make_device_matrix_view<const T, IdxT>(batch.data(), batch.size(), index->dim());
auto batch_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(
auto batch_data_view = raft::make_device_matrix_view<const T, internal_extents_t>(
batch.data(), batch.size(), index->dim());
auto batch_labels_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
new_data_labels.data() + batch.offset(), batch.size());
auto centers_view = raft::make_device_matrix_view<const float, IdxT>(
auto centers_view = raft::make_device_matrix_view<const float, internal_extents_t>(
cluster_centers.data(), n_clusters, index->dim());
raft::cluster::kmeans_balanced_params kmeans_params;
kmeans_params.metric = index->metric();
Expand Down Expand Up @@ -1767,10 +1770,10 @@ auto build(raft::resources const& handle,
auto cluster_centers = cluster_centers_buf.data();

// Train balanced hierarchical kmeans clustering
auto trainset_const_view =
raft::make_device_matrix_view<const float, IdxT>(trainset.data(), n_rows_train, index.dim());
auto centers_view =
raft::make_device_matrix_view<float, IdxT>(cluster_centers, index.n_lists(), index.dim());
auto trainset_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
trainset.data(), n_rows_train, index.dim());
auto centers_view = raft::make_device_matrix_view<float, internal_extents_t>(
cluster_centers, index.n_lists(), index.dim());
raft::cluster::kmeans_balanced_params kmeans_params;
kmeans_params.n_iters = params.kmeans_n_iters;
kmeans_params.metric = index.metric();
Expand All @@ -1779,9 +1782,10 @@ auto build(raft::resources const& handle,

// Trainset labels are needed for training PQ codebooks
rmm::device_uvector<uint32_t> labels(n_rows_train, stream, device_memory);
auto centers_const_view = raft::make_device_matrix_view<const float, IdxT>(
auto centers_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
cluster_centers, index.n_lists(), index.dim());
auto labels_view = raft::make_device_vector_view<uint32_t, IdxT>(labels.data(), n_rows_train);
auto labels_view =
raft::make_device_vector_view<uint32_t, internal_extents_t>(labels.data(), n_rows_train);
raft::cluster::kmeans_balanced::predict(handle,
kmeans_params,
trainset_const_view,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@

/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* NOTE: this file is to be used in source files generated by
* src/neighbors/detailivf_pq_compute_similarity_00_generate.py
*/

#pragma once

#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
#include <raft/neighbors/sample_filter.cuh>

#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \
OutT, LutT, IvfSampleFilterT) \
template auto \
raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
const cudaDeviceProp& dev_props, \
bool manage_local_topk, \
int locality_hint, \
double preferred_shmem_carveout, \
uint32_t pq_bits, \
uint32_t pq_dim, \
uint32_t precomp_data_count, \
uint32_t n_queries, \
uint32_t n_probes, \
uint32_t topk) \
->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>; \
\
template void \
raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>( \
raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s, \
rmm::cuda_stream_view stream, \
uint32_t dim, \
uint32_t n_probes, \
uint32_t pq_dim, \
uint32_t n_queries, \
uint32_t queries_offset, \
raft::distance::DistanceType metric, \
raft::neighbors::ivf_pq::codebook_gen codebook_kind, \
uint32_t topk, \
uint32_t max_samples, \
const float* cluster_centers, \
const float* pq_centers, \
const uint8_t* const* pq_dataset, \
const uint32_t* cluster_labels, \
const uint32_t* _chunk_indices, \
const float* queries, \
const uint32_t* index_list, \
float* query_kths, \
IvfSampleFilterT sample_filter, \
LutT* lut_scores, \
OutT* _out_scores, \
uint32_t* _out_indices);

#define COMMA ,
Loading

0 comments on commit d14cac2

Please sign in to comment.