rapidsai · rapids-bot · Mar 18, 2024 · Mar 5, 2024 · Mar 5, 2024 · Mar 5, 2024
@@ -156,6 +156,14 @@ if(BUILD_PRIMS_BENCH)
     bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
     bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu
     bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu
+    src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
     bench/prims/neighbors/refine_float_int64_t.cu
     bench/prims/neighbors/refine_uint8_t_int64_t.cu
     bench/prims/main.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#undef RAFT_EXPLICIT_INSTANTIATE_ONLY  // Enable instantiation of search with filter
 #include "../knn.cuh"
 
+#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
+#include <raft_internal/neighbors/ivf_pq_search_test-ext.cuh>
 namespace raft::bench::spatial {
 
 KNN_REGISTER(float, int64_t, ivf_pq_filter_knn, kInputsFilter, kNoCopyOnly, kScopeFull);

@@ -28,6 +28,7 @@
 #include <string>
 #include <type_traits>
 #include <unordered_map>
+#include <vector>
 
 namespace raft::common::nvtx::detail {
 

@@ -61,6 +61,8 @@ namespace raft::neighbors::ivf_pq::detail {
 
 using namespace raft::spatial::knn::detail;  // NOLINT
 
+using internal_extents_t = int64_t;  // The default mdspan extent type used internally.
+
 template <uint32_t BlockDim, typename T, typename S>
 __launch_bounds__(BlockDim) RAFT_KERNEL copy_warped_kernel(
   T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows)
@@ -442,15 +444,16 @@ void train_per_subset(raft::resources const& handle,
                  stream);
 
     // train PQ codebook for this subspace
-    auto sub_trainset_view =
-      raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), n_rows, index.pq_len());
-    auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
+    auto sub_trainset_view = raft::make_device_matrix_view<const float, internal_extents_t>(
+      sub_trainset.data(), n_rows, index.pq_len());
+    auto centers_tmp_view = raft::make_device_matrix_view<float, internal_extents_t>(
       pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j,
       index.pq_book_size(),
       index.pq_len());
-    auto sub_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(sub_labels.data(), n_rows);
-    auto cluster_sizes_view =
-      raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
+    auto sub_labels_view =
+      raft::make_device_vector_view<uint32_t, internal_extents_t>(sub_labels.data(), n_rows);
+    auto cluster_sizes_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
+      pq_cluster_sizes.data(), index.pq_book_size());
     raft::cluster::kmeans_balanced_params kmeans_params;
     kmeans_params.n_iters = kmeans_n_iters;
     kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
@@ -525,17 +528,17 @@ void train_per_cluster(raft::resources const& handle,
     size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim());
     auto pq_n_rows        = uint32_t(std::min(big_enough, available_rows));
     // train PQ codebook for this cluster
-    auto rot_vectors_view = raft::make_device_matrix_view<const float, IdxT>(
+    auto rot_vectors_view = raft::make_device_matrix_view<const float, internal_extents_t>(
       rot_vectors.data(), pq_n_rows, index.pq_len());
-    auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
+    auto centers_tmp_view = raft::make_device_matrix_view<float, internal_extents_t>(
       pq_centers_tmp.data() + static_cast<size_t>(index.pq_book_size()) *
                                 static_cast<size_t>(index.pq_len()) * static_cast<size_t>(l),
       index.pq_book_size(),
       index.pq_len());
     auto pq_labels_view =
-      raft::make_device_vector_view<uint32_t, IdxT>(pq_labels.data(), pq_n_rows);
-    auto pq_cluster_sizes_view =
-      raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
+      raft::make_device_vector_view<uint32_t, internal_extents_t>(pq_labels.data(), pq_n_rows);
+    auto pq_cluster_sizes_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
+      pq_cluster_sizes.data(), index.pq_book_size());
     raft::cluster::kmeans_balanced_params kmeans_params;
     kmeans_params.n_iters = kmeans_n_iters;
     kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
@@ -1587,11 +1590,11 @@ void extend(raft::resources const& handle,
                                     cudaMemcpyDefault,
                                     stream));
     for (const auto& batch : vec_batches) {
-      auto batch_data_view =
-        raft::make_device_matrix_view<const T, IdxT>(batch.data(), batch.size(), index->dim());
-      auto batch_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(
+      auto batch_data_view = raft::make_device_matrix_view<const T, internal_extents_t>(
+        batch.data(), batch.size(), index->dim());
+      auto batch_labels_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
         new_data_labels.data() + batch.offset(), batch.size());
-      auto centers_view = raft::make_device_matrix_view<const float, IdxT>(
+      auto centers_view = raft::make_device_matrix_view<const float, internal_extents_t>(
         cluster_centers.data(), n_clusters, index->dim());
       raft::cluster::kmeans_balanced_params kmeans_params;
       kmeans_params.metric = index->metric();
@@ -1767,10 +1770,10 @@ auto build(raft::resources const& handle,
     auto cluster_centers = cluster_centers_buf.data();
 
     // Train balanced hierarchical kmeans clustering
-    auto trainset_const_view =
-      raft::make_device_matrix_view<const float, IdxT>(trainset.data(), n_rows_train, index.dim());
-    auto centers_view =
-      raft::make_device_matrix_view<float, IdxT>(cluster_centers, index.n_lists(), index.dim());
+    auto trainset_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
+      trainset.data(), n_rows_train, index.dim());
+    auto centers_view = raft::make_device_matrix_view<float, internal_extents_t>(
+      cluster_centers, index.n_lists(), index.dim());
     raft::cluster::kmeans_balanced_params kmeans_params;
     kmeans_params.n_iters = params.kmeans_n_iters;
     kmeans_params.metric  = index.metric();
@@ -1779,9 +1782,10 @@ auto build(raft::resources const& handle,
 
     // Trainset labels are needed for training PQ codebooks
     rmm::device_uvector<uint32_t> labels(n_rows_train, stream, device_memory);
-    auto centers_const_view = raft::make_device_matrix_view<const float, IdxT>(
+    auto centers_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
       cluster_centers, index.n_lists(), index.dim());
-    auto labels_view = raft::make_device_vector_view<uint32_t, IdxT>(labels.data(), n_rows_train);
+    auto labels_view =
+      raft::make_device_vector_view<uint32_t, internal_extents_t>(labels.data(), n_rows_train);
     raft::cluster::kmeans_balanced::predict(handle,
                                             kmeans_params,
                                             trainset_const_view,

@@ -0,0 +1,71 @@
+
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is to be used in source files generated by
+ * src/neighbors/detailivf_pq_compute_similarity_00_generate.py
+ */
+
+#pragma once
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+#include <raft/neighbors/sample_filter.cuh>
+
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
+  OutT, LutT, IvfSampleFilterT)                                                             \
+  template auto                                                                             \
+  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
+    const cudaDeviceProp& dev_props,                                                        \
+    bool manage_local_topk,                                                                 \
+    int locality_hint,                                                                      \
+    double preferred_shmem_carveout,                                                        \
+    uint32_t pq_bits,                                                                       \
+    uint32_t pq_dim,                                                                        \
+    uint32_t precomp_data_count,                                                            \
+    uint32_t n_queries,                                                                     \
+    uint32_t n_probes,                                                                      \
+    uint32_t topk)                                                                          \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
+                                                                                            \
+  template void                                                                             \
+  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
+    rmm::cuda_stream_view stream,                                                           \
+    uint32_t dim,                                                                           \
+    uint32_t n_probes,                                                                      \
+    uint32_t pq_dim,                                                                        \
+    uint32_t n_queries,                                                                     \
+    uint32_t queries_offset,                                                                \
+    raft::distance::DistanceType metric,                                                    \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
+    uint32_t topk,                                                                          \
+    uint32_t max_samples,                                                                   \
+    const float* cluster_centers,                                                           \
+    const float* pq_centers,                                                                \
+    const uint8_t* const* pq_dataset,                                                       \
+    const uint32_t* cluster_labels,                                                         \
+    const uint32_t* _chunk_indices,                                                         \
+    const float* queries,                                                                   \
+    const uint32_t* index_list,                                                             \
+    float* query_kths,                                                                      \
+    IvfSampleFilterT sample_filter,                                                         \
+    LutT* lut_scores,                                                                       \
+    OutT* _out_scores,                                                                      \
+    uint32_t* _out_indices);
+
+#define COMMA ,