Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add explicit instantiations for IVF-PQ search kernels used in tests #2212

Merged
merged 14 commits into from
Mar 18, 2024
Merged
8 changes: 8 additions & 0 deletions cpp/bench/prims/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ if(BUILD_PRIMS_BENCH)
bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu
bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu
src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
bench/prims/neighbors/refine_float_int64_t.cu
bench/prims/neighbors/refine_uint8_t_int64_t.cu
bench/prims/main.cpp
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,9 +14,10 @@
* limitations under the License.
*/

#undef RAFT_EXPLICIT_INSTANTIATE_ONLY // Enable instantiation of search with filter
#include "../knn.cuh"

#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
#include <raft_internal/neighbors/ivf_pq_search_test-ext.cuh>
namespace raft::bench::spatial {

KNN_REGISTER(float, int64_t, ivf_pq_filter_knn, kInputsFilter, kNoCopyOnly, kScopeFull);
Expand Down
1 change: 1 addition & 0 deletions cpp/include/raft/core/detail/nvtx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>

namespace raft::common::nvtx::detail {

Expand Down
46 changes: 25 additions & 21 deletions cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ namespace raft::neighbors::ivf_pq::detail {

using namespace raft::spatial::knn::detail; // NOLINT

using internal_extents_t = int64_t; // The default mdspan extent type used internally.

template <uint32_t BlockDim, typename T, typename S>
__launch_bounds__(BlockDim) RAFT_KERNEL copy_warped_kernel(
T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows)
Expand Down Expand Up @@ -442,15 +444,16 @@ void train_per_subset(raft::resources const& handle,
stream);

// train PQ codebook for this subspace
auto sub_trainset_view =
raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), n_rows, index.pq_len());
auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
auto sub_trainset_view = raft::make_device_matrix_view<const float, internal_extents_t>(
sub_trainset.data(), n_rows, index.pq_len());
auto centers_tmp_view = raft::make_device_matrix_view<float, internal_extents_t>(
pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j,
index.pq_book_size(),
index.pq_len());
auto sub_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(sub_labels.data(), n_rows);
auto cluster_sizes_view =
raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
auto sub_labels_view =
raft::make_device_vector_view<uint32_t, internal_extents_t>(sub_labels.data(), n_rows);
auto cluster_sizes_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
pq_cluster_sizes.data(), index.pq_book_size());
raft::cluster::kmeans_balanced_params kmeans_params;
kmeans_params.n_iters = kmeans_n_iters;
kmeans_params.metric = raft::distance::DistanceType::L2Expanded;
Expand Down Expand Up @@ -525,17 +528,17 @@ void train_per_cluster(raft::resources const& handle,
size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim());
auto pq_n_rows = uint32_t(std::min(big_enough, available_rows));
// train PQ codebook for this cluster
auto rot_vectors_view = raft::make_device_matrix_view<const float, IdxT>(
auto rot_vectors_view = raft::make_device_matrix_view<const float, internal_extents_t>(
rot_vectors.data(), pq_n_rows, index.pq_len());
auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
auto centers_tmp_view = raft::make_device_matrix_view<float, internal_extents_t>(
pq_centers_tmp.data() + static_cast<size_t>(index.pq_book_size()) *
static_cast<size_t>(index.pq_len()) * static_cast<size_t>(l),
index.pq_book_size(),
index.pq_len());
auto pq_labels_view =
raft::make_device_vector_view<uint32_t, IdxT>(pq_labels.data(), pq_n_rows);
auto pq_cluster_sizes_view =
raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
raft::make_device_vector_view<uint32_t, internal_extents_t>(pq_labels.data(), pq_n_rows);
auto pq_cluster_sizes_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
pq_cluster_sizes.data(), index.pq_book_size());
raft::cluster::kmeans_balanced_params kmeans_params;
kmeans_params.n_iters = kmeans_n_iters;
kmeans_params.metric = raft::distance::DistanceType::L2Expanded;
Expand Down Expand Up @@ -1587,11 +1590,11 @@ void extend(raft::resources const& handle,
cudaMemcpyDefault,
stream));
for (const auto& batch : vec_batches) {
auto batch_data_view =
raft::make_device_matrix_view<const T, IdxT>(batch.data(), batch.size(), index->dim());
auto batch_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(
auto batch_data_view = raft::make_device_matrix_view<const T, internal_extents_t>(
batch.data(), batch.size(), index->dim());
auto batch_labels_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
new_data_labels.data() + batch.offset(), batch.size());
auto centers_view = raft::make_device_matrix_view<const float, IdxT>(
auto centers_view = raft::make_device_matrix_view<const float, internal_extents_t>(
cluster_centers.data(), n_clusters, index->dim());
raft::cluster::kmeans_balanced_params kmeans_params;
kmeans_params.metric = index->metric();
Expand Down Expand Up @@ -1767,10 +1770,10 @@ auto build(raft::resources const& handle,
auto cluster_centers = cluster_centers_buf.data();

// Train balanced hierarchical kmeans clustering
auto trainset_const_view =
raft::make_device_matrix_view<const float, IdxT>(trainset.data(), n_rows_train, index.dim());
auto centers_view =
raft::make_device_matrix_view<float, IdxT>(cluster_centers, index.n_lists(), index.dim());
auto trainset_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
trainset.data(), n_rows_train, index.dim());
auto centers_view = raft::make_device_matrix_view<float, internal_extents_t>(
cluster_centers, index.n_lists(), index.dim());
raft::cluster::kmeans_balanced_params kmeans_params;
kmeans_params.n_iters = params.kmeans_n_iters;
kmeans_params.metric = index.metric();
Expand All @@ -1779,9 +1782,10 @@ auto build(raft::resources const& handle,

// Trainset labels are needed for training PQ codebooks
rmm::device_uvector<uint32_t> labels(n_rows_train, stream, device_memory);
auto centers_const_view = raft::make_device_matrix_view<const float, IdxT>(
auto centers_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
cluster_centers, index.n_lists(), index.dim());
auto labels_view = raft::make_device_vector_view<uint32_t, IdxT>(labels.data(), n_rows_train);
auto labels_view =
raft::make_device_vector_view<uint32_t, internal_extents_t>(labels.data(), n_rows_train);
raft::cluster::kmeans_balanced::predict(handle,
kmeans_params,
trainset_const_view,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@

/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* NOTE: this file is to be used in source files generated by
* src/neighbors/detailivf_pq_compute_similarity_00_generate.py
*/

#pragma once

#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
#include <raft/neighbors/sample_filter.cuh>

#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \
OutT, LutT, IvfSampleFilterT) \
template auto \
raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
const cudaDeviceProp& dev_props, \
bool manage_local_topk, \
int locality_hint, \
double preferred_shmem_carveout, \
uint32_t pq_bits, \
uint32_t pq_dim, \
uint32_t precomp_data_count, \
uint32_t n_queries, \
uint32_t n_probes, \
uint32_t topk) \
->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>; \
\
template void \
raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>( \
raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s, \
rmm::cuda_stream_view stream, \
uint32_t dim, \
uint32_t n_probes, \
uint32_t pq_dim, \
uint32_t n_queries, \
uint32_t queries_offset, \
raft::distance::DistanceType metric, \
raft::neighbors::ivf_pq::codebook_gen codebook_kind, \
uint32_t topk, \
uint32_t max_samples, \
const float* cluster_centers, \
const float* pq_centers, \
const uint8_t* const* pq_dataset, \
const uint32_t* cluster_labels, \
const uint32_t* _chunk_indices, \
const float* queries, \
const uint32_t* index_list, \
float* query_kths, \
IvfSampleFilterT sample_filter, \
LutT* lut_scores, \
OutT* _out_scores, \
uint32_t* _out_indices);

#define COMMA ,
Loading
Loading