Fixes to new YAML config raft-bench-ann (#1945)

Authors: - Divye Gala (https://github.com/divyegala) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: #1945
rapidsai · Nov 1, 2023 · 67a796c · 67a796c
1 parent d504795
commit 67a796c
Show file tree

Hide file tree

Showing 19 changed files with 163 additions and 106 deletions.
diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
@@ -120,7 +120,7 @@ class ANN : public AnnBase {
   // The advantage of this way is that index has smaller size
   // and many indices can share one dataset.
   //
-  // AlgoProperty::need_dataset_when_search of such algorithm should be true,
+  // SearchParam::needs_dataset() of such algorithm should return true,
   // and set_search_dataset() should save the passed-in pointer somewhere.
   // The client code should call set_search_dataset() before searching,
   // and should not release dataset before searching is finished.

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
@@ -215,11 +215,13 @@ void bench_search(::benchmark::State& state,
       search_param->metric_objective = metric_objective;
     } catch (const std::exception& e) {
       state.SkipWithError("Failed to create an algo: " + std::string(e.what()));
+      return;
     }
-    algo->set_search_param(*search_param);
+
     auto algo_property = parse_algo_property(algo->get_preference(), sp_json);
     current_algo_props = std::make_shared<AlgoProperty>(algo_property.dataset_memory_type,
                                                         algo_property.query_memory_type);
+
     if (search_param->needs_dataset()) {
       try {
         algo->set_search_dataset(dataset->base_set(current_algo_props->dataset_memory_type),
@@ -231,6 +233,14 @@ void bench_search(::benchmark::State& state,
         return;
       }
     }
+
+    try {
+      algo->set_search_param(*search_param);
+
+    } catch (const std::exception& ex) {
+      state.SkipWithError("An error occurred setting search parameters: " + std::string(ex.what()));
+      return;
+    }
   }
 
   const auto algo_property = *current_algo_props;

diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp
@@ -49,7 +49,7 @@ void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissCpuIVFPQ<T>::BuildParam& param)
 {
   parse_base_build_param<T>(conf, param);
-  param.M_ratio = conf.at("M_ratio");
+  param.M = conf.at("M");
   if (conf.contains("usePrecomputed")) {
     param.usePrecomputed = conf.at("usePrecomputed");
   } else {

diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h
@@ -152,6 +152,7 @@ void FaissCpu<T>::build(const T* dataset, size_t nrow, cudaStream_t stream)
   index_->train(nrow, dataset);  // faiss::IndexFlat::train() will do nothing
   assert(index_->is_trained);
   index_->add(nrow, dataset);
+  index_refine_ = std::make_unique<faiss::IndexRefineFlat>(this->index_.get(), dataset);
 }
 
 template <typename T>
@@ -163,7 +164,6 @@ void FaissCpu<T>::set_search_param(const AnnSearchParam& param)
   dynamic_cast<faiss::IndexIVF*>(index_.get())->nprobe = nprobe;
 
   if (search_param.refine_ratio > 1.0) {
-    this->index_refine_ = std::make_unique<faiss::IndexRefineFlat>(this->index_.get());
     this->index_refine_.get()->k_factor = search_param.refine_ratio;
   }
 
@@ -229,20 +229,16 @@ template <typename T>
 class FaissCpuIVFPQ : public FaissCpu<T> {
  public:
   struct BuildParam : public FaissCpu<T>::BuildParam {
-    int M_ratio;
+    int M;
     int bitsPerCode;
     bool usePrecomputed;
   };
 
   FaissCpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissCpu<T>(metric, dim, param)
   {
     this->init_quantizer(dim);
-    this->index_ = std::make_unique<faiss::IndexIVFPQ>(this->quantizer_.get(),
-                                                       dim,
-                                                       param.nlist,
-                                                       dim / param.M_ratio,
-                                                       param.bitsPerCode,
-                                                       this->metric_type_);
+    this->index_ = std::make_unique<faiss::IndexIVFPQ>(
+      this->quantizer_.get(), dim, param.nlist, param.M, param.bitsPerCode, this->metric_type_);
   }
 
   void save(const std::string& file) const override

diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu
@@ -50,7 +50,7 @@ void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissGpuIVFPQ<T>::BuildParam& param)
 {
   parse_base_build_param<T>(conf, param);
-  param.M_ratio = conf.at("M_ratio");
+  param.M = conf.at("M");
   if (conf.contains("usePrecomputed")) {
     param.usePrecomputed = conf.at("usePrecomputed");
   } else {

diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h
@@ -35,6 +35,9 @@
 #include <faiss/index_io.h>
 #include <omp.h>
 
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/stream_view.hpp>
+
 #include <cassert>
 #include <memory>
 #include <stdexcept>
@@ -84,6 +87,7 @@ class FaissGpu : public ANN<T> {
   struct SearchParam : public AnnSearchParam {
     int nprobe;
     float refine_ratio = 1.0;
+    auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; }
   };
 
   struct BuildParam {
@@ -101,6 +105,7 @@ class FaissGpu : public ANN<T> {
     RAFT_CUDA_TRY(cudaGetDevice(&device_));
     RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming));
     faiss_default_stream_ = gpu_resource_.getDefaultStream(device_);
+    raft::resource::set_cuda_stream(handle_, faiss_default_stream_);
   }
 
   virtual ~FaissGpu() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); }
@@ -109,6 +114,8 @@ class FaissGpu : public ANN<T> {
 
   virtual void set_search_param(const FaissGpu<T>::AnnSearchParam& param) {}
 
+  void set_search_dataset(const T* dataset, size_t nrow) override { dataset_ = dataset; }
+
   // TODO: if the number of results is less than k, the remaining elements of 'neighbors'
   // will be filled with (size_t)-1
   void search(const T* queries,
@@ -123,7 +130,7 @@ class FaissGpu : public ANN<T> {
     AlgoProperty property;
     // to enable building big dataset which is larger than GPU memory
     property.dataset_memory_type = MemoryType::Host;
-    property.query_memory_type   = MemoryType::Device;
+    property.query_memory_type   = MemoryType::Host;
     return property;
   }
 
@@ -142,14 +149,17 @@ class FaissGpu : public ANN<T> {
 
   mutable faiss::gpu::StandardGpuResources gpu_resource_;
   std::unique_ptr<faiss::gpu::GpuIndex> index_;
-  std::unique_ptr<faiss::IndexRefineFlat> index_refine_;
+  std::unique_ptr<faiss::IndexRefineFlat> index_refine_{nullptr};
   faiss::MetricType metric_type_;
   int nlist_;
   int device_;
   cudaEvent_t sync_{nullptr};
   cudaStream_t faiss_default_stream_{nullptr};
   double training_sample_fraction_;
   std::unique_ptr<faiss::SearchParameters> search_params_;
+  const T* dataset_;
+  raft::device_resources handle_;
+  float refine_ratio_ = 1.0;
 };
 
 template <typename T>
@@ -194,7 +204,25 @@ void FaissGpu<T>::search(const T* queries,
 {
   static_assert(sizeof(size_t) == sizeof(faiss::idx_t),
                 "sizes of size_t and faiss::idx_t are different");
-  index_->search(batch_size, queries, k, distances, reinterpret_cast<faiss::idx_t*>(neighbors));
+
+  if (this->refine_ratio_ > 1.0) {
+    // TODO: FAISS changed their search APIs to accept the search parameters as a struct object,
+    // but their refine API doesn't allow the struct to be passed in. Once this is fixed, we
+    // need to re-enable refinement with the call below:
+    //   index_refine_->search(batch_size, queries, k, distances,
+    //     reinterpret_cast<faiss::idx_t*>(neighbors), this->search_params_.get());
+    // Related FAISS issue: https://github.com/facebookresearch/faiss/issues/3118
+    throw std::runtime_error(
+      "FAISS doesn't support refinement in their new APIs so this feature is disabled in the "
+      "benchmarks for the time being.");
+  } else {
+    index_->search(batch_size,
+                   queries,
+                   k,
+                   distances,
+                   reinterpret_cast<faiss::idx_t*>(neighbors),
+                   this->search_params_.get());
+  }
   stream_wait(stream);
 }
 
@@ -217,7 +245,13 @@ void FaissGpu<T>::load_(const std::string& file)
 
   std::unique_ptr<CpuIndex> cpu_index(dynamic_cast<CpuIndex*>(faiss::read_index(file.c_str())));
   assert(cpu_index);
-  dynamic_cast<GpuIndex*>(index_.get())->copyFrom(cpu_index.get());
+
+  try {
+    dynamic_cast<GpuIndex*>(index_.get())->copyFrom(cpu_index.get());
+
+  } catch (const std::exception& e) {
+    std::cout << "Error loading index file: " << std::string(e.what()) << std::endl;
+  }
 }
 
 template <typename T>
@@ -242,11 +276,7 @@ class FaissGpuIVFFlat : public FaissGpu<T> {
     faiss::IVFSearchParameters faiss_search_params;
     faiss_search_params.nprobe = nprobe;
     this->search_params_       = std::make_unique<faiss::IVFSearchParameters>(faiss_search_params);
-
-    if (search_param.refine_ratio > 1.0) {
-      this->index_refine_ = std::make_unique<faiss::IndexRefineFlat>(this->index_.get());
-      this->index_refine_.get()->k_factor = search_param.refine_ratio;
-    }
+    this->refine_ratio_        = search_param.refine_ratio;
   }
 
   void save(const std::string& file) const override
@@ -263,7 +293,7 @@ template <typename T>
 class FaissGpuIVFPQ : public FaissGpu<T> {
  public:
   struct BuildParam : public FaissGpu<T>::BuildParam {
-    int M_ratio;
+    int M;
     bool useFloat16;
     bool usePrecomputed;
   };
@@ -279,7 +309,7 @@ class FaissGpuIVFPQ : public FaissGpu<T> {
       std::make_unique<faiss::gpu::GpuIndexIVFPQ>(&(this->gpu_resource_),
                                                   dim,
                                                   param.nlist,
-                                                  dim / param.M_ratio,
+                                                  param.M,
                                                   8,  // FAISS only supports bitsPerCode=8
                                                   this->metric_type_,
                                                   config);
@@ -290,14 +320,15 @@ class FaissGpuIVFPQ : public FaissGpu<T> {
     auto search_param = dynamic_cast<const typename FaissGpu<T>::SearchParam&>(param);
     int nprobe        = search_param.nprobe;
     assert(nprobe <= nlist_);
-
+    this->refine_ratio_ = search_param.refine_ratio;
     faiss::IVFPQSearchParameters faiss_search_params;
     faiss_search_params.nprobe = nprobe;
 
     this->search_params_ = std::make_unique<faiss::IVFPQSearchParameters>(faiss_search_params);
 
     if (search_param.refine_ratio > 1.0) {
-      this->index_refine_ = std::make_unique<faiss::IndexRefineFlat>(this->index_.get());
+      this->index_refine_ =
+        std::make_unique<faiss::IndexRefineFlat>(this->index_.get(), this->dataset_);
       this->index_refine_.get()->k_factor = search_param.refine_ratio;
     }
   }
@@ -349,9 +380,10 @@ class FaissGpuIVFSQ : public FaissGpu<T> {
     faiss_search_params.nprobe = nprobe;
 
     this->search_params_ = std::make_unique<faiss::IVFSearchParameters>(faiss_search_params);
-
+    this->refine_ratio_  = search_param.refine_ratio;
     if (search_param.refine_ratio > 1.0) {
-      this->index_refine_ = std::make_unique<faiss::IndexRefineFlat>(this->index_.get());
+      this->index_refine_ =
+        std::make_unique<faiss::IndexRefineFlat>(this->index_.get(), this->dataset_);
       this->index_refine_.get()->k_factor = search_param.refine_ratio;
     }
   }

diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md
@@ -91,7 +91,7 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 | `M_ratio`        | `build_param`  | Y        | Positive Integer Power of 2 [8-64] |         | Ratio of number of chunks or subquantizers for each vector. Computed by `dims` / `M_ratio`                                                                                          |
 | `usePrecomputed` | `build_param`  | N        | Boolean. Default=`false`         | `false` | Use pre-computed lookup tables to speed up search at the cost of increased memory usage.                                                                                          |
 | `useFloat16`     | `build_param`  | N        | Boolean. Default=`false`         | `false`  | Use half-precision floats for clustering step.                                                                                                                                    |
-| `numProbes`      | `search_params` | Y        | Positive Integer >0              |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                       |
+| `nprobe`         | `search_params` | Y        | Positive Integer >0              |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                       |
 | `refine_ratio`   | `search_params` | N| Positive Number >=0          | 0       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.             |
 
 ### `faiss_cpu_flat`
@@ -118,16 +118,16 @@ Use FAISS IVF-Flat index on CPU
 
 Use FAISS IVF-PQ index on CPU
 
-| Parameter       | Type           | Required | Data Type                          | Default | Description                                                                                                                                                                   |
-|-----------------|----------------|----------|------------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `nlist`         | `build_param`  | Y        | Positive Integer >0                |         | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
-| `ratio`         | `build_param`  | N        | Positive Integer >0                | 2       | `1/ratio` is the number of training points which should be used to train the clusters.                                                                                        |
-| `M`             | `build_param`  | Y        | Positive Integer Power of 2 [8-64] |         | Number of chunks or subquantizers for each vector.                                                                                                                            |
+| Parameter        | Type           | Required | Data Type                          | Default | Description                                                                                                                                                                   |
+|------------------|----------------|----------|------------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `nlist`          | `build_param`  | Y        | Positive Integer >0                |         | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `ratio`          | `build_param`  | N        | Positive Integer >0                | 2       | `1/ratio` is the number of training points which should be used to train the clusters.                                                                                        |
+| `M`              | `build_param`  | Y        | Positive Integer Power of 2 [8-64] |         | Number of chunks or subquantizers for each vector.                                                                                                                            |
 | `usePrecomputed` | `build_param`  | N        | Boolean. Default=`false`           | `false` | Use pre-computed lookup tables to speed up search at the cost of increased memory usage.                                                                                      |
-| `bitsPerCode`   | `build_param`  | N        | Positive Integer [4-8]             | 8       | Number of bits to use for each code.                                                                                                                                          |
-| `numProbes`     | `search_params` | Y        | Positive Integer >0                |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                   |
-| `refine_ratio`  | `search_params` | N| Positive Number >=0                | 0       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.         |
-| `numThreads`    | `search_params` | N        | Positive Integer >0                  | 1       | Number of threads to use for queries.                                                                                                                                                                                                                                                             |
+| `bitsPerCode`    | `build_param`  | N        | Positive Integer [4-8]             | 8       | Number of bits to use for each code.                                                                                                                                          |
+| `nprobe`         | `search_params` | Y        | Positive Integer >0                |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                   |
+| `refine_ratio`   | `search_params` | N| Positive Number >=0                | 0       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.         |
+| `numThreads`     | `search_params` | N        | Positive Integer >0                  | 1       | Number of threads to use for queries.                                                                                                                                                                                                                                                             |
 
 
 ## HNSW

diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
@@ -425,6 +425,7 @@ A single configuration will often define a set of algorithms, with associated in
   base_file: sift-128-euclidean/base.fbin
   query_file: sift-128-euclidean/query.fbin
   groundtruth_neighbors_file: sift-128-euclidean/groundtruth.neighbors.ibin
+  dims: 128
   distance: euclidean
 ```