diff --git a/conda/recipes/raft-ann-bench-cpu/meta.yaml b/conda/recipes/raft-ann-bench-cpu/meta.yaml
index 355ea640ff..699e485d0b 100644
--- a/conda/recipes/raft-ann-bench-cpu/meta.yaml
+++ b/conda/recipes/raft-ann-bench-cpu/meta.yaml
@@ -57,6 +57,7 @@ requirements:
     - matplotlib
     - python
    - pyyaml
+    - benchmark
 
 about:
   home: https://rapids.ai/
diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml
index 882ff6cc49..a5c20b0a28 100644
--- a/conda/recipes/raft-ann-bench/meta.yaml
+++ b/conda/recipes/raft-ann-bench/meta.yaml
@@ -90,7 +90,11 @@ requirements:
     - libfaiss {{ faiss_version }}
     {% endif %}
     - h5py {{ h5py_version }}
-
+    - benchmark
+    - glog {{ glog_version }}
+    - matplotlib
+    - python
+    - pyyaml
 about:
   home: https://rapids.ai/
   license: Apache-2.0
diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu
index 231154ccfd..56885cce5c 100644
--- a/cpp/bench/ann/src/faiss/faiss_benchmark.cu
+++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu
@@ -30,19 +30,27 @@ namespace raft::bench::ann {
 
+template <typename T>
+void parse_base_build_param(const nlohmann::json& conf,
+                            typename raft::bench::ann::FaissGpu<T>::BuildParam& param)
+{
+  param.nlist = conf.at("nlist");
+  if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); }
+}
+
 template <typename T>
 void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissGpuIVFFlat<T>::BuildParam& param)
 {
-  param.nlist = conf.at("nlist");
+  parse_base_build_param<T>(conf, param);
 }
 
 template <typename T>
 void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissGpuIVFPQ<T>::BuildParam& param)
 {
-  param.nlist = conf.at("nlist");
-  param.M = conf.at("M");
+  parse_base_build_param<T>(conf, param);
+  param.M = conf.at("M");
   if (conf.contains("usePrecomputed")) {
     param.usePrecomputed = conf.at("usePrecomputed");
   } else {
@@ -59,7 +67,7 @@ template <typename T>
 void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissGpuIVFSQ<T>::BuildParam& param)
 {
-  param.nlist = conf.at("nlist");
+  parse_base_build_param<T>(conf, param);
   param.quantizer_type = conf.at("quantizer_type");
 }
 
diff --git a/cpp/bench/ann/src/faiss/faiss_wrapper.h b/cpp/bench/ann/src/faiss/faiss_wrapper.h
index ec80e6cbfd..672c685b1f 100644
--- a/cpp/bench/ann/src/faiss/faiss_wrapper.h
+++ b/cpp/bench/ann/src/faiss/faiss_wrapper.h
@@ -18,6 +18,7 @@
 
 #include "../common/ann_types.hpp"
 
+#include 
 #include 
 
 #include 
@@ -85,7 +86,23 @@ class FaissGpu : public ANN<T> {
     float refine_ratio = 1.0;
   };
 
-  FaissGpu(Metric metric, int dim, int nlist);
+  struct BuildParam {
+    int nlist = 1;
+    int ratio = 2;
+  };
+
+  FaissGpu(Metric metric, int dim, const BuildParam& param)
+    : ANN<T>(metric, dim),
+      metric_type_(parse_metric_type(metric)),
+      nlist_{param.nlist},
+      training_sample_fraction_{1.0 / double(param.ratio)}
+  {
+    static_assert(std::is_same_v<T, float>, "faiss support only float type");
+    RAFT_CUDA_TRY(cudaGetDevice(&device_));
+    RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming));
+    faiss_default_stream_ = gpu_resource_.getDefaultStream(device_);
+  }
+
   virtual ~FaissGpu() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); }
 
   void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final;
@@ -131,23 +148,35 @@ class FaissGpu : public ANN<T> {
   int device_;
   cudaEvent_t sync_{nullptr};
   cudaStream_t faiss_default_stream_{nullptr};
+  double training_sample_fraction_;
 };
 
-template <typename T>
-FaissGpu<T>::FaissGpu(Metric metric, int dim, int nlist)
-  : ANN<T>(metric, dim), metric_type_(parse_metric_type(metric)), nlist_(nlist)
-{
-  static_assert(std::is_same_v<T, float>, "faiss support only float type");
-  RAFT_CUDA_TRY(cudaGetDevice(&device_));
-  RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming));
-  faiss_default_stream_ = gpu_resource_.getDefaultStream(device_);
-}
-
 template <typename T>
 void FaissGpu<T>::build(const T* dataset, size_t nrow, cudaStream_t stream)
 {
   OmpSingleThreadScope omp_single_thread;
-
+  auto index_ivf = dynamic_cast<faiss::gpu::GpuIndexIVF*>(index_.get());
+  if (index_ivf != nullptr) {
+    // set the min/max training size for clustering to use the whole provided training set.
+    double trainset_size = training_sample_fraction_ * static_cast<double>(nrow);
+    double points_per_centroid = trainset_size / static_cast<double>(nlist_);
+    int max_ppc = std::ceil(points_per_centroid);
+    int min_ppc = std::floor(points_per_centroid);
+    if (min_ppc < index_ivf->cp.min_points_per_centroid) {
+      RAFT_LOG_WARN(
+        "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d "
+        "points per cluster (n_lists = %d). This is smaller than the FAISS default "
+        "min_points_per_centroid = %d.",
+        static_cast<size_t>(trainset_size),
+        nrow,
+        training_sample_fraction_,
+        min_ppc,
+        nlist_,
+        index_ivf->cp.min_points_per_centroid);
+    }
+    index_ivf->cp.max_points_per_centroid = max_ppc;
+    index_ivf->cp.min_points_per_centroid = min_ppc;
+  }
   index_->train(nrow, dataset);  // faiss::gpu::GpuIndexFlat::train() will do nothing
   assert(index_->is_trained);
   index_->add(nrow, dataset);
@@ -208,12 +237,9 @@ void FaissGpu<T>::load_(const std::string& file)
 template <typename T>
 class FaissGpuIVFFlat : public FaissGpu<T> {
  public:
-  struct BuildParam {
-    int nlist;
-  };
+  using typename FaissGpu<T>::BuildParam;
 
-  FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param)
-    : FaissGpu<T>(metric, dim, param.nlist)
+  FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) : FaissGpu<T>(metric, dim, param)
   {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = this->device_;
@@ -234,15 +260,13 @@ class FaissGpuIVFPQ : public FaissGpu<T> {
  public:
-  struct BuildParam {
-    int nlist;
+  struct BuildParam : public FaissGpu<T>::BuildParam {
     int M;
     bool useFloat16;
     bool usePrecomputed;
   };
 
-  FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param)
-    : FaissGpu<T>(metric, dim, param.nlist)
+  FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissGpu<T>(metric, dim, param)
   {
     faiss::gpu::GpuIndexIVFPQConfig config;
     config.useFloat16LookupTables = param.useFloat16;
@@ -271,13 +295,11 @@ class FaissGpuIVFSQ : public FaissGpu<T> {
  public:
-  struct BuildParam {
-    int nlist;
+  struct BuildParam : public FaissGpu<T>::BuildParam {
     std::string quantizer_type;
   };
 
-  FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param)
-    : FaissGpu<T>(metric, dim, param.nlist)
+  FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param) : FaissGpu<T>(metric, dim, param)
   {
     faiss::ScalarQuantizer::QuantizerType qtype;
     if (param.quantizer_type == "fp16") {
@@ -310,7 +332,8 @@ class FaissGpuFlat : public FaissGpu<T> {
  public:
-  FaissGpuFlat(Metric metric, int dim) : FaissGpu<T>(metric, dim, 0)
+  FaissGpuFlat(Metric metric, int dim)
+    : FaissGpu<T>(metric, dim, typename FaissGpu<T>::BuildParam{})
   {
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = this->device_;
diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu
index aa25d1532f..7ba381ab0a 100644
--- a/cpp/bench/ann/src/raft/raft_benchmark.cu
+++ b/cpp/bench/ann/src/raft/raft_benchmark.cu
@@ -58,10 +58,7 @@ void parse_build_param(const nlohmann::json& conf,
 {
   param.n_lists = conf.at("nlist");
   if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); }
-  if (conf.contains("ratio")) {
-    param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio");
-    std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction;
-  }
+  if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); }
 }
 
 template <typename T>
@@ -82,6 +79,17 @@ void parse_build_param(const nlohmann::json& conf,
   if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); }
   if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); }
   if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); }
+  if (conf.contains("codebook_kind")) {
+    std::string kind = conf.at("codebook_kind");
+    if (kind == "cluster") {
+      param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER;
+    } else if (kind == "subspace") {
+      param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE;
+    } else {
+      throw std::runtime_error("codebook_kind: '" + kind +
+                               "', should be either 'cluster' or 'subspace'");
+    }
+  }
 }
 
 template <typename T>
diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
index 1554c1f016..8f1e43a706 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
@@ -63,9 +63,14 @@ class RaftIvfPQ : public ANN<T> {
     rmm::mr::set_current_device_resource(&mr_);
     index_params_.metric = parse_metric_type(metric);
     RAFT_CUDA_TRY(cudaGetDevice(&device_));
+    RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming));
   }
 
-  ~RaftIvfPQ() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); }
+  ~RaftIvfPQ() noexcept
+  {
+    RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_));
+    rmm::mr::set_current_device_resource(mr_.get_upstream());
+  }
 
   void build(const T* dataset, size_t nrow, cudaStream_t stream) final;
@@ -96,6 +101,7 @@ class RaftIvfPQ : public ANN<T> {
   // `mr_` must go first to make sure it dies last
   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr_;
   raft::device_resources handle_;
+  cudaEvent_t sync_{nullptr};
   BuildParam index_params_;
   raft::neighbors::ivf_pq::search_params search_params_;
   std::optional<raft::neighbors::ivf_pq::index<IdxT>> index_;
@@ -103,6 +109,12 @@ class RaftIvfPQ : public ANN<T> {
   int dimension_;
   float refine_ratio_ = 1.0;
   raft::device_matrix_view<const T, IdxT> dataset_;
+
+  void stream_wait(cudaStream_t stream) const
+  {
+    RAFT_CUDA_TRY(cudaEventRecord(sync_, resource::get_cuda_stream(handle_)));
+    RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_));
+  }
 };
 
 template <typename T, typename IdxT>
@@ -121,12 +133,12 @@ void RaftIvfPQ<T, IdxT>::load(const std::string& file)
 }
 
 template <typename T, typename IdxT>
-void RaftIvfPQ<T, IdxT>::build(const T* dataset, size_t nrow, cudaStream_t)
+void RaftIvfPQ<T, IdxT>::build(const T* dataset, size_t nrow, cudaStream_t stream)
 {
   auto dataset_v = raft::make_device_matrix_view<const T, IdxT>(dataset, IdxT(nrow), dim_);
 
   index_.emplace(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v));
-  return;
+  stream_wait(stream);
 }
 
 template <typename T, typename IdxT>
@@ -176,16 +188,14 @@ void RaftIvfPQ<T, IdxT>::search(const T* queries,
                                        neighbors_v,
                                        distances_v,
                                        index_->metric());
+      stream_wait(stream);  // RAFT stream -> bench stream
     } else {
       auto queries_host    = raft::make_host_matrix<T, IdxT>(batch_size, index_->dim());
       auto candidates_host = raft::make_host_matrix<IdxT, IdxT>(batch_size, k0);
       auto neighbors_host  = raft::make_host_matrix<IdxT, IdxT>(batch_size, k);
       auto distances_host  = raft::make_host_matrix<float, IdxT>(batch_size, k);
 
-      raft::copy(queries_host.data_handle(),
-                 queries,
-                 queries_host.size(),
-                 resource::get_cuda_stream(handle_));
+      raft::copy(queries_host.data_handle(), queries, queries_host.size(), stream);
       raft::copy(candidates_host.data_handle(),
                  candidates.data_handle(),
                  candidates_host.size(),
@@ -194,6 +204,10 @@ void RaftIvfPQ<T, IdxT>::search(const T* queries,
       auto dataset_v = raft::make_host_matrix_view<const T, IdxT>(
         dataset_.data_handle(), dataset_.extent(0), dataset_.extent(1));
 
+      // wait for the queries to copy to host in `stream` and for IVF-PQ::search to finish
+      RAFT_CUDA_TRY(cudaEventRecord(sync_, resource::get_cuda_stream(handle_)));
+      RAFT_CUDA_TRY(cudaEventRecord(sync_, stream));
+      RAFT_CUDA_TRY(cudaEventSynchronize(sync_));
       raft::runtime::neighbors::refine(handle_,
                                        dataset_v,
                                        queries_host.view(),
@@ -202,14 +216,8 @@ void RaftIvfPQ<T, IdxT>::search(const T* queries,
                                        distances_host.view(),
                                        index_->metric());
 
-      raft::copy(neighbors,
-                 (size_t*)neighbors_host.data_handle(),
-                 neighbors_host.size(),
-                 resource::get_cuda_stream(handle_));
-      raft::copy(distances,
-                 distances_host.data_handle(),
-                 distances_host.size(),
-                 resource::get_cuda_stream(handle_));
+      raft::copy(neighbors, (size_t*)neighbors_host.data_handle(), neighbors_host.size(), stream);
+      raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream);
     }
   } else {
     auto queries_v =
@@ -219,8 +227,7 @@ void RaftIvfPQ<T, IdxT>::search(const T* queries,
     raft::runtime::neighbors::ivf_pq::search(
       handle_, search_params_, *index_, queries_v, neighbors_v, distances_v);
+    stream_wait(stream);  // RAFT stream -> bench stream
   }
-  resource::sync_stream(handle_);
-  return;
 }
 }  // namespace raft::bench::ann
diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md
index 020c2d5ad9..ca8ffa5e18 100644
--- a/docs/source/ann_benchmarks_param_tuning.md
+++ b/docs/source/ann_benchmarks_param_tuning.md
@@ -1,6 +1,6 @@
 # ANN Benchmarks Parameter Tuning Guide
 
-This guide outlines the various parameter settings that can be specified in [RAFT ANN Benchmark](raft_ann_benchmarks.md) json configuration files and explains the impact they have on corresponding algorithms to help inform their settings for benchmarking across desired levels of recall.
+This guide outlines the various parameter settings that can be specified in [RAFT ANN Benchmark](raft_ann_benchmarks.md) json configuration files and explains the impact they have on corresponding algorithms to help inform their settings for benchmarking across desired levels of recall.
 
 ## RAFT Indexes
@@ -15,8 +15,8 @@ IVF-flat is a simple algorithm which won't save any space, but it provides compe
 |-----------|------------------|----------|---------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
 | `niter` | `build_param` | N | Positive Integer >0 | 20 | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
-| `ratio` | `build_param` | N | Positive Float >0 | 0.5 | Fraction of the number of training points which should be used to train the clusters. |
-| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. |
+| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the fraction of the dataset used to train the clusters. |
+| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. |
 
 ### `raft_ivf_pq`
@@ -27,8 +27,10 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 |-------------------------|----------------|---|------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
 | `niter` | `build_param` | N | Positive Integer >0 | 20 | Number of k-means iterations to use when training the clusters. |
+| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the fraction of the dataset used to train the clusters. |
 | `pq_dim` | `build_param` | N | Positive Integer. Multiple of 8. | 0 | Dimensionality of the vector after product quantization. When 0, a heuristic is used to select this value. `pq_dim` * `pq_bits` must be a multiple of 8. |
 | `pq_bits` | `build_param` | N | Positive Integer. [4-8] | 8 | Bit length of the vector element after quantization. |
+| `codebook_kind` | `build_param` | N | ["cluster", "subspace"] | "subspace" | Type of codebook. See the [API docs](https://docs.rapids.ai/api/raft/nightly/cpp_api/neighbors_ivf_pq/#_CPPv412codebook_gen) for more detail |
 | `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. |
 | `internalDistanceDtype` | `search_params` | N | [`float`, `half`] | `half` | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy. |
 | `smemLutDtype` | `search_params` | N | [`float`, `half`, `fp8`] | `half` | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy. |
@@ -58,7 +60,8 @@ IVF-flat is a simple algorithm which won't save any space, but it provides compe
 | Parameter | Type | Required | Data Type | Default | Description |
 |-----------|----------------|----------|---------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the fraction of the dataset used to train the clusters. |
 | `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. |
 
 ### `faiss_gpu_ivf_pq`
@@ -68,6 +71,7 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 | Parameter | Type | Required | Data Type | Default | Description |
 |------------------|----------------|----------|----------------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the fraction of the dataset used to train the clusters. |
 | `M` | `build_param` | Y | Positive Integer Power of 2 [8-64] | | Number of chunks or subquantizers for each vector. |
 | `usePrecomputed` | `build_param` | N | Boolean. Default=`false` | `false` | Use pre-computed lookup tables to speed up search at the cost of increased memory usage. |
 | `useFloat16` | `build_param` | N | Boolean. Default=`false` | `false` | Use half-precision floats for clustering step. |
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index e0c02bb7eb..af0b040d34 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -8,7 +8,7 @@ The easiest way to install these benchmarks is through conda. We provide package
 
 ```bash
-mamba env create --name raft_ann_benchmarks
+mamba create --name raft_ann_benchmarks
 conda activate raft_ann_benchmarks
 
 # to install GPU package:
@@ -25,7 +25,7 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm
 ## Running the benchmarks
 
 ### Usage
-There are 3 general steps to running the benchmarks and vizualizing the results:
+There are 4 general steps to running the benchmarks and visualizing the results:
 1. Prepare Dataset
 2. Build Index and Search Index
 3. Data Export
@@ -39,7 +39,7 @@ expected to be defined to run these scripts; this variable holds the directory w
 
 ### End-to-end example: Million-scale
 
-The steps below demonstrate how to download, install, and run benchmarks on a subset of 10M vectors from the Yandex Deep-1B dataset By default the datasets will be stored and used from the folder indicated by the RAPIDS_DATASET_ROOT_DIR environment variable if defined, otherwise a datasets subfolder from where the script is being called:
+The steps below demonstrate how to download, install, and run benchmarks on a subset of 10M vectors from the Yandex Deep-1B dataset. By default the datasets will be stored and used from the folder indicated by the RAPIDS_DATASET_ROOT_DIR environment variable if defined, otherwise a datasets sub-folder from where the script is being called:
 
 ```bash
@@ -56,7 +56,7 @@ python -m raft-ann-bench.data_export --dataset deep-image-96-inner
 python -m raft-ann-bench.plot --dataset deep-image-96-inner
 ```
 
-Configuration files already exist for the following list of the million-scale datasets. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `python/raft-ann-bench/src/raft-ann-bench/conf`.
+Configuration files already exist for the following list of the million-scale datasets. Please refer to [ann-benchmarks datasets](https://github.com/erikbern/ann-benchmarks/#data-sets) for more information, including actual train and test sizes. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `python/raft-ann-bench/src/raft-ann-bench/run/conf`.
 - `deep-image-96-angular`
 - `fashion-mnist-784-euclidean`
 - `glove-50-angular`
@@ -80,17 +80,17 @@ mkdir -p datasets/deep-1B
 # (1) prepare dataset
 # download manually "Ground Truth" file of "Yandex DEEP"
 # suppose the file name is deep_new_groundtruth.public.10K.bin
-python python -m raft-ann-bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin
+python -m raft-ann-bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin
 # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
 
 # (2) build and search index
-python python -m raft-ann-bench.run --dataset deep-1B
+python -m raft-ann-bench.run --dataset deep-1B
 
 # (3) export data
-python python -m raft-ann-bench.data_export --dataset deep-1B
+python -m raft-ann-bench.data_export --dataset deep-1B
 
 # (4) plot results
-python python -m raft-ann-bench.plot --dataset deep-1B
+python -m raft-ann-bench.plot --dataset deep-1B
 ```
 
 The usage of `python -m raft-ann-bench.split-groundtruth` is:
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index d5a65ddfb7..347c68c477 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -145,7 +145,7 @@ def main():
 
     # Read list of allowed algorithms
     try:
-        import pylibraft  # noqa: F401
+        import rmm  # noqa: F401
 
         gpu_present = True
     except ImportError:
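The `ratio` and `codebook_kind` build parameters documented in the tuning guide above are supplied through the json configuration files that `raft-ann-bench.run` reads (the docs point to `python/raft-ann-bench/src/raft-ann-bench/run/conf`). Below is a minimal sketch of a hypothetical `raft_ivf_pq` entry; the values are illustrative only, and the surrounding keys such as `name`, `algo`, and `file` are assumptions that should be checked against the existing configuration files in that directory.

```json
{
  "name": "raft_ivf_pq.nlist1024-ratio2",
  "algo": "raft_ivf_pq",
  "build_param": {
    "nlist": 1024,
    "niter": 20,
    "ratio": 2,
    "pq_dim": 64,
    "pq_bits": 8,
    "codebook_kind": "subspace"
  },
  "file": "deep-image-96-inner/raft_ivf_pq/nlist1024-ratio2",
  "search_params": [
    {"nprobe": 20, "internalDistanceDtype": "half", "smemLutDtype": "half"},
    {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}
  ]
}
```

With `ratio: 2`, half of the dataset is sampled for k-means training (`kmeans_trainset_fraction = 1/ratio`), and `codebook_kind: "subspace"` maps to `raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE`, as in the parsing code shown earlier.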